-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathget_ifdb_year.py
More file actions
73 lines (61 loc) · 2.1 KB
/
get_ifdb_year.py
File metadata and controls
73 lines (61 loc) · 2.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import time
from bs4 import BeautifulSoup
import urllib.request
import pandas as pd
# Module-level defaults: the first results page for games published in 2023.
year: int = 2023
page: int = 1

# Search URL built from the defaults above. Note that ``sortby`` is left
# empty here.
url: str = f'https://ifdb.org/search?searchfor=published%3A{year}&sortby=&pg={page}'
def parse_search_result(url):
    """Fetch one search-results page and look up every game listed on it.

    Each ``<h3 class="result">`` heading on the page is expected to hold a
    link whose query string carries the game's id (``...?id=<tuid>``). The
    id is passed to ``get_game_info`` and the returned record is printed
    and collected.

    Args:
        url: Address of the search-results page to download.

    Returns:
        A list with one ``get_game_info`` result per usable heading, in
        page order. The list is empty when the page has no result headings.

    Raises:
        urllib.error.URLError: If the page (or a game record) cannot be
            downloaded.
    """
    # Local import keeps this block self-contained; urllib.parse is stdlib.
    from urllib.parse import parse_qs, urlsplit

    with urllib.request.urlopen(url) as fp:
        data = fp.read()
    html = data.decode("ISO-8859-1")
    soup = BeautifulSoup(html, 'lxml')

    all_game_info = []
    for result in soup.find_all('h3', attrs={'class': 'result'}):
        link = result.find('a')
        if link is None:
            # Heading without a link: nothing to look up.
            continue

        # Read the ``id`` parameter explicitly instead of splitting on '=',
        # so extra query parameters or a missing '=' cannot corrupt the id
        # or raise IndexError.
        href = link.get('href', '')
        ids = parse_qs(urlsplit(href).query).get('id')
        if not ids:
            continue
        tuid = ids[0]

        game_info = get_game_info(tuid)
        print(game_info)
        all_game_info.append(game_info)
        # Short pause between per-game requests.
        time.sleep(0.1)
    return all_game_info
def get_game_info(tuid):
    """Download the ``viewgame?ifiction`` XML for one game and extract fields.

    Args:
        tuid: Game id string, appended as the ``id`` query parameter.

    Returns:
        A dict with the keys ``'title'``, ``'author'``, ``'IFDB-link'``,
        ``'game-url'`` and ``'playonline-url'``. Any field that cannot be
        found in the XML is an empty string.

    Raises:
        urllib.error.URLError: If the XML cannot be downloaded.
    """
    url = 'https://ifdb.org/viewgame?ifiction&id=' + tuid
    with urllib.request.urlopen(url) as fp:
        data = fp.read()
    xml = BeautifulSoup(data, features='xml')

    def first_text(tag_name):
        # find() returns None when the element is absent; fall back to ''
        # (like the URL fields below) instead of raising AttributeError and
        # aborting the whole scrape.
        tag = xml.find(tag_name)
        return tag.text if tag is not None else ''

    title = first_text('title')
    author = first_text('author')

    game_url = ''
    play_url = ''
    # TODO: this is messy
    # Heuristic: the first <url> is taken as the game URL; the second is
    # taken as the play-online URL unless it points back at ifdb.org.
    urls = xml.find_all('url')
    if len(urls) >= 1:
        game_url = urls[0].text
    if len(urls) >= 2:
        if not urls[1].text.startswith('https://ifdb.org'):
            play_url = urls[1].text

    base_url = 'https://ifdb.org/viewgame?id=' + tuid
    # website?
    return {
        'title': title,
        'author': author,
        'IFDB-link': base_url,
        'game-url': game_url,
        'playonline-url': play_url,
    }
def get_all_data(year):
    """Gather game records from every search-results page for ``year``.

    Pages are requested in order, starting at page 1 with ``sortby=old``,
    and each page's records are appended to one list. The walk stops after
    the first page that yields no records.

    Args:
        year: Publication year inserted into the search query.

    Returns:
        A single list holding the records from all pages, in request order.
    """
    collected = []
    page = 0
    while True:
        page += 1
        batch = parse_search_result(
            f'https://ifdb.org/search?searchfor=published%3A{year}&sortby=old&pg={page}'
        )
        collected += batch
        # Pause after every page request except the very first one.
        if page > 1:
            time.sleep(0.5)
        if not batch:
            return collected
if __name__ == '__main__':
    # Scrape every game published in 2023, then write the records to a
    # tab-separated file without the DataFrame index column.
    games = get_all_data(2023)
    table = pd.DataFrame(games)
    table.to_csv('ifdb_2023_all.csv', sep='\t', index=None)
# select all h3 class="result"