
Commit 1a06373

Invent time travel (part 2)
1 parent e5f63e8 commit 1a06373

1 file changed: scrape_tags.py (+138 −85)
@@ -1,21 +1,23 @@
 import os
 import requests
+import collections
 import csv
 import time
 import datetime
 
+class Complete(Exception): pass
+
 csv_filename = input('Output filename: ')
 minimum_count = input('Minimum tag count (> 50 is preferable): ')
 dashes = input('replace \'_\' with \'-\'? (often better for prompt following) (Y/n): ')
 exclude = input('enter categories to exclude: (general,artist,copyright,character,post) (press enter for none): \n')
-alias = input('Include aliases? (Only supported in tag-complete) (Y/n): ')
 boards = input('Enter boards to scrape danbooru(d), e621(e), both(de) (default: danbooru): ')
 date = input('Enter cutoff date. ex: 2024-09-03 for september 3rd 2024: ')
 try:
     max_date = datetime.datetime.strptime(date.strip()[:10], "%Y-%m-%d")
     print(f"Using date: {max_date}")
 except:
-    max_date = str(datetime.datetime.now())[:10]
+    max_date = datetime.datetime.now()
     print(f"Using todays date: {max_date}")
 
 boards = boards.lower()
@@ -41,92 +43,130 @@
     dashes = 'y'
     csv_filename += '-temp'
 
-if not 'n' in alias.lower():
-    alias = 'y'
-
 if not minimum_count.isdigit():
     minimum_count = 50
 
-# Base URLs
-dan_base_url = 'https://danbooru.donmai.us/tags.json?limit=1000&search[hide_empty]=yes&search[is_deprecated]=no&search[order]=count'
-dan_alias_url = 'https://danbooru.donmai.us/tag_aliases.json?commit=Search&search%5Bconsequent_name_matches%5D='
+# Base URLs without the page parameter
+base_url = 'https://danbooru.donmai.us/tags.json?limit=1000&search[hide_empty]=yes&search[is_deprecated]=no&search[order]=count'
+alias_url = 'https://danbooru.donmai.us/tag_aliases.json?commit=Search&limit=1000&search[order]=tag_count'
 e6_base_url = 'https://e621.net/tags.json?limit=1000&search[hide_empty]=yes&search[is_deprecated]=no&search[order]=count'
-e6_alias_url = 'https://e621.net/tag_aliases.json?commit=Search&search%5Bconsequent_name%5D='
+e6_alias_url = 'https://e621.net/tag_aliases.json?commit=Search&limit=1000&search[order]=tag_count'
 
 session = requests.Session()
 
-class Complete(Exception): pass
-
-def get_aliases(tags, name, url, max_date, session):
-    url = url + name
-    while True:
-        response = session.get(url,headers={"User-Agent": "tag-list/3.0"})
-        if response.status_code == 200:
-            aliases = {}
+dan_aliases = collections.defaultdict(str)
+e6_aliases = collections.defaultdict(str)
+
+
+def backdate(tags, aliases, date):
+    print(f"Clearing older aliases")
+    filtered_aliases = {}
+    for key in aliases:
+        kept = []
+        for item in aliases[key]:
+            entry_date = datetime.datetime.strptime(item[1][:10], "%Y-%m-%d")
+            if entry_date <= date:
+                kept += [item[0]]
+        filtered_aliases[key] = kept
+
+    #print(filtered_aliases)
+
+    for key in list(tags.keys()): # prevents size change error
+        #print(f"Processing {key}")
+        if datetime.datetime.strptime(tags[key][2][:10], "%Y-%m-%d") > date:
+            try:
+                new_key = filtered_aliases[key].pop(0)
+                value = tags.pop(key)
+                tags[new_key] = value
+            except Exception as e:
+                #print(f"{key} removed\n{e}")
+                pass
+
+    # add aliases
+    for key in filtered_aliases:
+        try:
+            alias_string = ",".join(filtered_aliases[key])
+            tags[key] += [alias_string]
+        except:
+            #print(f"{key} probably doesn't exist in one list or the other, likely a cuttoff thing")
+            pass
+
+
+def get_aliases(url,type):
+    # create alias dictionary
+    try:
+        aliases = collections.defaultdict(list)
+        for page in range(1,5):
+            # Update the URL with the current page
+            url = f'{url}&page={page}'
+            # Fetch the JSON data
+            while True:
+                response = session.get(url,headers={"User-Agent": "tag-list/2.0"})
+                if response.status_code == 200:
+                    break
+                else:
+                    print(f"Couldn't reach server, Status: {response.status_code}.\nRetrying in 5 seconds")
+                    time.sleep(5)
             data = response.json()
+            # Break the loop if the data is empty (no more tags to fetch)
+            if not data:
+                print(f'No more data found at page {page}. Stopping.', flush=True)
+                break
             for item in data:
-                aliases[item['antecedent_name']] = item['antecedent_name'],item['created_at']
-
-            aliases = {key: value for key, value in aliases.items()
-                       if datetime.datetime.strptime(value[1][:10], "%Y-%m-%d") <= max_date}
-
-            if datetime.datetime.strptime(tags[name][2][:10], "%Y-%m-%d") >= max_date:
-                try:
-                    previous_key = tags.pop(name)
-                    tags[aliases[0][0]] = previous_key
-                    lst_alias = []
-                    for index in range(1, len(alias)):
-                        lst_alias += alias[index][0]
-                    tags[aliases[0][0]] += [lst_alias]
-                    dan_tags[aliases[0][0]] += [''] # safety index
-                    return
-                except: # if there are no aliases for a tag which must be removed
-                    print(f"Removed {name}")
-                    return
-            dan_tags[name] += [''] # safety index
-        else:
-            print("Failed to get aliases, likely a connection error.\nRetrying in 5 seconds...")
-
+                if type == "e": # danbooru doesn't have post counts for aliases
+                    if int(item['post_count']) < int(minimum_count):
+                        raise Complete
+                aliases[item['consequent_name']] += [[item['antecedent_name'],item['created_at']]]
+            print(f'Page {page} aliases processed.', flush=True)
+            time.sleep(0.1) # avoid cloudflare rate limit
+    except(Complete):
+        print("reached the post threshold")
+    return(aliases)
 
+#######
 if "d" in boards:
     dan_tags = {}
     try:
-        for page in range(1, 1001):
+        for page in range(1, 5):
             # Update the URL with the current page
-            url = f'{dan_base_url}&page={page}'
+            url = f'{base_url}&page={page}'
             # Fetch the JSON data
-            response = session.get(url,headers={"User-Agent": "tag-list/3.0"})
-            # Check if the request was successful
-            if response.status_code == 200:
-                data = response.json()
-                # Break the loop if the data is empty (no more tags to fetch)
-                if not data:
-                    print(f'No more data found at page {page}. Stopping.', flush=True)
+            while True:
+                response = session.get(url,headers={"User-Agent": "tag-list/2.0"})
+                if response.status_code == 200:
                     break
-
-                for item in data:
-                    if int(item['post_count']) < int(minimum_count): # break if below minimum count
-                        raise Complete
-                    if not str(item['category']) in excluded:
-                        dan_tags[item['name']] = [item['category'],item['post_count'],item['created_at']]
-                        get_aliases(dan_tags, item['name'], dan_alias_url, max_date, session)
-            else:
-                print(f'Failed to fetch data for page {page}. HTTP Status Code: {response.status_code}', flush=True)
+                else:
+                    print(f"Couldn't reach server, Status: {response.status_code}.\nRetrying in 5 seconds")
+                    time.sleep(5)
+            data = response.json()
+            # Break the loop if the data is empty (no more tags to fetch)
+            if not data:
+                print(f'No more data found at page {page}. Stopping.', flush=True)
                 break
+
+            for item in data:
+                if int(item['post_count']) < int(minimum_count): # break if below minimum count
+                    raise Complete
+                if not str(item['category']) in excluded:
+                    dan_tags[item['name']] = [item['category'],item['post_count'],item['created_at']]
             print(f'Danbooru page {page} processed.', flush=True)
-            # Sleep for 0.5 second because we have places to be
-            time.sleep(0.5)
+            time.sleep(0.1) # avoid cloudflare rate limit
     except(Complete):
         pass
 
+if "d" in boards:
+    dan_aliases = get_aliases(alias_url, "d")
+    backdate(dan_tags,dan_aliases,max_date)
+
+
 if "e" in boards:
     e6_tags = {}
     try:
-        for page in range(1, 1001):
+        for page in range(1, 2):
             # Update the URL with the current page
             url = f'{e6_base_url}&page={page}'
             # Fetch the JSON data
-            response = session.get(url,headers={"User-Agent": "tag-list/3.0"})
+            response = session.get(url,headers={"User-Agent": "tag-list/2.0"})
             # Check if the request was successful
             if response.status_code == 200:
                 data = response.json()
@@ -140,54 +180,67 @@ def get_aliases(tags, name, url, max_date, session):
                         raise Complete
                     if not str(item['category']) in excluded:
                         e6_tags[item['name']] = [item['category'],item['post_count'],item['created_at']]
-                        get_aliases(e6_tags, item['name'], e6_alias_url, max_date, session)
             else:
                 print(f'Failed to fetch data for page {page}. HTTP Status Code: {response.status_code}', flush=True)
                 break
-            print(f'e6 page {page} processed.', flush=True)
-            # Sleep for 0.5 second because we have places to be
-            time.sleep(0.5)
-    except(Complete):
-        pass
+            print(f'e621 page {page} processed.', flush=True)
+            # e6 gets mad if you make more than 1 per second
+            time.sleep(1)
+    except Complete:
+        print(f'All tags with {minimum_count} posts or greater have been scraped.')
+
+# e6 tags are fucked, a proper solution would take ~10 hours per list and I'm not going that far for furries
+#if "e" in boards:
+#    e6_aliases = get_aliases(e6_alias_url, "e")
+#    backdate(e6_tags,e6_aliases,max_date)
+
 
 # Merge boards
 if ("d" in boards) and ("e" in boards):
     for tag in dan_tags:
         if tag in e6_tags:
             e6_tags[tag][1] += dan_tags[tag][1] # combined count
+            """if e6_tags[tag][2] != None and dan_tags[tag][2] != None:
+                if e6_tags[tag][2] == "":
+                    e6_tags[tag][2] += dan_tags[tag][2] # aliases
+                else:
+                    e6_tags[tag][2] += "," + dan_tags[tag][2]"""
     dan_tags.update(e6_tags)
     full_tags = dan_tags
 elif "d" in boards:
     full_tags = dan_tags
 else:
     full_tags = e6_tags
 
+# Open a file to write
 print("writing to file")
 with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
     writer = csv.writer(file)
     # danbooru
     # Write the data
     for key, value in full_tags.items():
         if not str(value[0]) in excluded:
-            if alias == 'n':
-                writer.writerow([key,value[0],value[1],''])
-            else:
+            try:
                 writer.writerow([key,value[0],value[1],value[3]])
+            except:
+                writer.writerow([key,value[0],value[1],'']) #too lazy for a proper fix
     # Explicitly flush the data to the file
 file.close()
 
-if dashes == 'y':
-    print(f'Replacing \'_\' with \'-\'')
-    with open(csv_filename, 'r', encoding='utf-8') as csvfile:
-        reader = csv.reader(csvfile)
-        with open(csv_filename.removesuffix('-temp'), 'w', encoding='utf-8', newline='') as outfile:
-            writer = csv.writer(outfile)
-            for row in reader:
-                if not row[0] in kaomojis:
-                    row[0] = row[0].replace("_", "-")
-                    row[3] = row[3].replace("_", "-")
-                writer.writerow(row)
-        outfile.close()
-    csvfile.close()
-    os.remove(csv_filename)
-    csv_filename = csv_filename.removesuffix('-temp')
+if dashes == 'y':
+    print(f'Replacing \'_\' with \'-\'')
+    with open(csv_filename, 'r', encoding='utf-8') as csvfile:
+        reader = csv.reader(csvfile)
+        with open(csv_filename.removesuffix('-temp'), 'w', encoding='utf-8', newline='') as outfile:
+            writer = csv.writer(outfile)
+            for row in reader:
+                if not row[0] in kaomojis:
+                    row[0] = row[0].replace("_", "-")
+                    row[3] = row[3].replace("_", "-")
                writer.writerow(row)
+        outfile.close()
+    csvfile.close()
+    os.remove(csv_filename)
+    csv_filename = csv_filename.removesuffix('-temp')
+
+print(f'Data has been written to {csv_filename}', flush=True)
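The heart of this commit is the new backdate() step: a tag created after the cutoff date is renamed to an alias that already existed on that date. A minimal standalone sketch of that idea, using hypothetical toy data (the tag and alias names below are illustrative, not taken from either site):

import datetime

cutoff = datetime.datetime(2024, 9, 3)

# tag -> [category, post_count, created_at], the same shape as dan_tags
tags = {
    'new_name': ['0', 1200, '2025-01-15T00:00:00.000-05:00'],
    'old_tag':  ['0', 900,  '2019-06-01T00:00:00.000-05:00'],
}
# consequent (current) name -> [[antecedent (old) name, created_at], ...]
aliases = {
    'new_name': [['newer_alias', '2025-01-15T00:00:00.000-05:00'],
                 ['older_name',  '2020-03-10T00:00:00.000-05:00']],
}

def parse(ts):
    return datetime.datetime.strptime(ts[:10], "%Y-%m-%d")

# Keep only aliases that already existed on the cutoff date.
filtered = {k: [name for name, ts in v if parse(ts) <= cutoff]
            for k, v in aliases.items()}

for key in list(tags):  # list() so the dict can shrink during iteration
    if parse(tags[key][2]) > cutoff:
        surviving = filtered.get(key, [])
        if surviving:
            tags[surviving.pop(0)] = tags.pop(key)  # rename to the pre-cutoff alias
        else:
            del tags[key]  # nothing predates the cutoff: drop the tag

print(tags)  # {'old_tag': [...], 'older_name': [...]}

Unlike the committed backdate(), this sketch also drops post-cutoff tags whose filtered alias list is empty; in the commit, the bare except leaves such tags in place even though the commented-out message suggests they were removed.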

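Both tag loops and the new get_aliases() share one fetch pattern: paginate until an empty page, retry on any non-200 status, and sleep between requests. A condensed sketch of that pattern as a reusable helper (fetch_pages is a hypothetical name, not part of the commit); note that it rebuilds the URL from the base on every iteration, whereas the committed get_aliases() reassigns url in place, so each request appends another &page= parameter to the previous URL:

import time
import requests

session = requests.Session()

def fetch_pages(base_url, max_pages=4):
    # Collect JSON rows from pages 1..max_pages of a booru-style endpoint.
    results = []
    for page in range(1, max_pages + 1):
        url = f'{base_url}&page={page}'  # fresh URL each page
        while True:  # retry until the server answers 200
            response = session.get(url, headers={"User-Agent": "tag-list/2.0"})
            if response.status_code == 200:
                break
            print(f"Couldn't reach server, Status: {response.status_code}. Retrying in 5 seconds")
            time.sleep(5)
        data = response.json()
        if not data:  # an empty page means there is nothing left to fetch
            break
        results.extend(data)
        time.sleep(0.1)  # stay under the Cloudflare rate limit
    return results

With a helper like this, the Danbooru loop above would reduce to fetch_pages(base_url) followed by the per-item count and category filtering.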