import os
import requests
+ import collections
import csv
import time
import datetime

+ class Complete(Exception): pass
+
csv_filename = input('Output filename: ')
minimum_count = input('Minimum tag count (> 50 is preferable): ')
dashes = input('replace \'_\' with \'-\'? (often better for prompt following) (Y/n): ')
exclude = input('enter categories to exclude: (general,artist,copyright,character,post) (press enter for none): \n')
- alias = input('Include aliases? (Only supported in tag-complete) (Y/n): ')
boards = input('Enter boards to scrape danbooru(d), e621(e), both(de) (default: danbooru): ')
date = input('Enter cutoff date. ex: 2024-09-03 for september 3rd 2024: ')
try:
    max_date = datetime.datetime.strptime(date.strip()[:10], "%Y-%m-%d")
    print(f"Using date: {max_date}")
except:
-     max_date = str(datetime.datetime.now())[:10]
+     max_date = datetime.datetime.now()
    print(f"Using today's date: {max_date}")

boards = boards.lower()
    dashes = 'y'
    csv_filename += '-temp'

- if not 'n' in alias.lower():
-     alias = 'y'
-
if not minimum_count.isdigit():
    minimum_count = 50

- # Base URLs
- dan_base_url = 'https://danbooru.donmai.us/tags.json?limit=1000&search[hide_empty]=yes&search[is_deprecated]=no&search[order]=count'
- dan_alias_url = 'https://danbooru.donmai.us/tag_aliases.json?commit=Search&search%5Bconsequent_name_matches%5D='
+ # Base URLs without the page parameter
+ base_url = 'https://danbooru.donmai.us/tags.json?limit=1000&search[hide_empty]=yes&search[is_deprecated]=no&search[order]=count'
+ alias_url = 'https://danbooru.donmai.us/tag_aliases.json?commit=Search&limit=1000&search[order]=tag_count'
e6_base_url = 'https://e621.net/tags.json?limit=1000&search[hide_empty]=yes&search[is_deprecated]=no&search[order]=count'
- e6_alias_url = 'https://e621.net/tag_aliases.json?commit=Search&search%5Bconsequent_name%5D='
+ e6_alias_url = 'https://e621.net/tag_aliases.json?commit=Search&limit=1000&search[order]=tag_count'

session = requests.Session()

- class Complete(Exception): pass
-
- def get_aliases(tags, name, url, max_date, session):
-     url = url + name
-     while True:
-         response = session.get(url, headers={"User-Agent": "tag-list/3.0"})
-         if response.status_code == 200:
-             aliases = {}
+ dan_aliases = collections.defaultdict(str)
+ e6_aliases = collections.defaultdict(str)
+
+
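+ # backdate(): keep only aliases created on or before the cutoff, rename tags created after the cutoff
+ # to a pre-cutoff alias (dropping them if none exists), then attach each tag's aliases as a comma-separated string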
+ def backdate(tags, aliases, date):
+     print(f"Clearing older aliases")
+     filtered_aliases = {}
+     for key in aliases:
+         kept = []
+         for item in aliases[key]:
+             entry_date = datetime.datetime.strptime(item[1][:10], "%Y-%m-%d")
+             if entry_date <= date:
+                 kept += [item[0]]
+         filtered_aliases[key] = kept
+
+     #print(filtered_aliases)
+
+     for key in list(tags.keys()):  # prevents size change error
+         #print(f"Processing {key}")
+         if datetime.datetime.strptime(tags[key][2][:10], "%Y-%m-%d") > date:
+             try:
+                 new_key = filtered_aliases[key].pop(0)
+                 value = tags.pop(key)
+                 tags[new_key] = value
+             except Exception as e:
+                 #print(f"{key} removed\n{e}")
+                 pass
+
+     # add aliases
+     for key in filtered_aliases:
+         try:
+             alias_string = ",".join(filtered_aliases[key])
+             tags[key] += [alias_string]
+         except:
+             #print(f"{key} probably doesn't exist in one list or the other, likely a cutoff thing")
+             pass
+
+
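+ # get_aliases(): page through the board's tag_aliases endpoint and return a dict mapping
+ # each consequent (canonical) tag to a list of [antecedent, created_at] pairs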
+ def get_aliases(url, type):
+     # create alias dictionary
+     try:
+         aliases = collections.defaultdict(list)
+         for page in range(1, 5):
+             # Update the URL with the current page
+             page_url = f'{url}&page={page}'  # build from the base URL each pass so page parameters don't accumulate
+             # Fetch the JSON data
+             while True:
+                 response = session.get(page_url, headers={"User-Agent": "tag-list/2.0"})
+                 if response.status_code == 200:
+                     break
+                 else:
+                     print(f"Couldn't reach server, Status: {response.status_code}.\nRetrying in 5 seconds")
+                     time.sleep(5)
            data = response.json()
+             # Break the loop if the data is empty (no more tags to fetch)
+             if not data:
+                 print(f'No more data found at page {page}. Stopping.', flush=True)
+                 break
            for item in data:
-                 aliases[item['antecedent_name']] = item['antecedent_name'], item['created_at']
-
-             aliases = {key: value for key, value in aliases.items()
-                        if datetime.datetime.strptime(value[1][:10], "%Y-%m-%d") <= max_date}
-
-             if datetime.datetime.strptime(tags[name][2][:10], "%Y-%m-%d") >= max_date:
-                 try:
-                     previous_key = tags.pop(name)
-                     tags[aliases[0][0]] = previous_key
-                     lst_alias = []
-                     for index in range(1, len(alias)):
-                         lst_alias += alias[index][0]
-                     tags[aliases[0][0]] += [lst_alias]
-                     dan_tags[aliases[0][0]] += ['']  # safety index
-                     return
-                 except:  # if there are no aliases for a tag which must be removed
-                     print(f"Removed {name}")
-                     return
-             dan_tags[name] += ['']  # safety index
-         else:
-             print("Failed to get aliases, likely a connection error.\nRetrying in 5 seconds...")
-
+                 if type == "e":  # danbooru doesn't have post counts for aliases
+                     if int(item['post_count']) < int(minimum_count):
+                         raise Complete
+                 aliases[item['consequent_name']] += [[item['antecedent_name'], item['created_at']]]
+             print(f'Page {page} aliases processed.', flush=True)
+             time.sleep(0.1)  # avoid cloudflare rate limit
+     except (Complete):
+         print("reached the post threshold")
+     return (aliases)

+ #######
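+ # Scrape Danbooru tags in descending post-count order until one falls below the minimum count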
if "d" in boards:
    dan_tags = {}
    try:
-         for page in range(1, 1001):
+         for page in range(1, 5):
            # Update the URL with the current page
-             url = f'{dan_base_url}&page={page}'
+             url = f'{base_url}&page={page}'
            # Fetch the JSON data
-             response = session.get(url, headers={"User-Agent": "tag-list/3.0"})
-             # Check if the request was successful
-             if response.status_code == 200:
-                 data = response.json()
-                 # Break the loop if the data is empty (no more tags to fetch)
-                 if not data:
-                     print(f'No more data found at page {page}. Stopping.', flush=True)
+             while True:
+                 response = session.get(url, headers={"User-Agent": "tag-list/2.0"})
+                 if response.status_code == 200:
                    break
-
-                 for item in data:
-                     if int(item['post_count']) < int(minimum_count):  # break if below minimum count
-                         raise Complete
-                     if not str(item['category']) in excluded:
-                         dan_tags[item['name']] = [item['category'], item['post_count'], item['created_at']]
-                         get_aliases(dan_tags, item['name'], dan_alias_url, max_date, session)
-             else:
-                 print(f'Failed to fetch data for page {page}. HTTP Status Code: {response.status_code}', flush=True)
+                 else:
+                     print(f"Couldn't reach server, Status: {response.status_code}.\nRetrying in 5 seconds")
+                     time.sleep(5)
+             data = response.json()
+             # Break the loop if the data is empty (no more tags to fetch)
+             if not data:
+                 print(f'No more data found at page {page}. Stopping.', flush=True)
                break
+
+             for item in data:
+                 if int(item['post_count']) < int(minimum_count):  # break if below minimum count
+                     raise Complete
+                 if not str(item['category']) in excluded:
+                     dan_tags[item['name']] = [item['category'], item['post_count'], item['created_at']]
            print(f'Danbooru page {page} processed.', flush=True)
-             # Sleep for 0.5 second because we have places to be
-             time.sleep(0.5)
+             time.sleep(0.1)  # avoid cloudflare rate limit
    except (Complete):
        pass

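+ # Fetch Danbooru's alias list and backdate the scraped tags to the chosen cutoff date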
+ if "d" in boards:
+     dan_aliases = get_aliases(alias_url, "d")
+     backdate(dan_tags, dan_aliases, max_date)
+
+
if "e" in boards:
    e6_tags = {}
    try:
-         for page in range(1, 1001):
+         for page in range(1, 2):
            # Update the URL with the current page
            url = f'{e6_base_url}&page={page}'
            # Fetch the JSON data
-             response = session.get(url, headers={"User-Agent": "tag-list/3.0"})
+             response = session.get(url, headers={"User-Agent": "tag-list/2.0"})
            # Check if the request was successful
            if response.status_code == 200:
                data = response.json()
@@ -140,54 +180,67 @@ def get_aliases(tags, name, url, max_date, session):
                        raise Complete
                    if not str(item['category']) in excluded:
                        e6_tags[item['name']] = [item['category'], item['post_count'], item['created_at']]
-                         get_aliases(e6_tags, item['name'], e6_alias_url, max_date, session)
            else:
                print(f'Failed to fetch data for page {page}. HTTP Status Code: {response.status_code}', flush=True)
                break
-             print(f'e6 page {page} processed.', flush=True)
-             # Sleep for 0.5 second because we have places to be
-             time.sleep(0.5)
-     except (Complete):
-         pass
+             print(f'e621 page {page} processed.', flush=True)
+             # e6 gets mad if you make more than 1 per second
+             time.sleep(1)
+     except Complete:
+         print(f'All tags with {minimum_count} posts or greater have been scraped.')
+
+ # e6 tags are fucked, a proper solution would take ~10 hours per list and I'm not going that far for furries
+ #if "e" in boards:
+ #    e6_aliases = get_aliases(e6_alias_url, "e")
+ #    backdate(e6_tags, e6_aliases, max_date)
+

# Merge boards
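# Shared tags get their post counts summed; dan_tags.update(e6_tags) means the e621 entry wins on key collisions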
if ("d" in boards) and ("e" in boards):
    for tag in dan_tags:
        if tag in e6_tags:
            e6_tags[tag][1] += dan_tags[tag][1]  # combined count
+             """if e6_tags[tag][2] != None and dan_tags[tag][2] != None:
+                 if e6_tags[tag][2] == "":
+                     e6_tags[tag][2] += dan_tags[tag][2] # aliases
+                 else:
+                     e6_tags[tag][2] += "," + dan_tags[tag][2]"""
    dan_tags.update(e6_tags)
    full_tags = dan_tags
elif "d" in boards:
    full_tags = dan_tags
else:
    full_tags = e6_tags

+ # Open a file to write
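+ # Each CSV row: tag name, category, post count, comma-separated aliases (blank when none were collected)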
print("writing to file")
with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # danbooru
    # Write the data
    for key, value in full_tags.items():
        if not str(value[0]) in excluded:
-             if alias == 'n':
-                 writer.writerow([key, value[0], value[1], ''])
-             else:
+             try:
                writer.writerow([key, value[0], value[1], value[3]])
+             except:
+                 writer.writerow([key, value[0], value[1], ''])  # too lazy for a proper fix
    # Explicitly flush the data to the file
    file.close()

- if dashes == 'y':
-     print(f'Replacing \'_\' with \'-\'')
-     with open(csv_filename, 'r', encoding='utf-8') as csvfile:
-         reader = csv.reader(csvfile)
-         with open(csv_filename.removesuffix('-temp'), 'w', encoding='utf-8', newline='') as outfile:
-             writer = csv.writer(outfile)
-             for row in reader:
-                 if not row[0] in kaomojis:
-                     row[0] = row[0].replace("_", "-")
-                 row[3] = row[3].replace("_", "-")
-                 writer.writerow(row)
-             outfile.close()
-         csvfile.close()
-     os.remove(csv_filename)
-     csv_filename = csv_filename.removesuffix('-temp')
+ if dashes == 'y':
+     print(f'Replacing \'_\' with \'-\'')
+     with open(csv_filename, 'r', encoding='utf-8') as csvfile:
+         reader = csv.reader(csvfile)
+         with open(csv_filename.removesuffix('-temp'), 'w', encoding='utf-8', newline='') as outfile:
+             writer = csv.writer(outfile)
+             for row in reader:
+                 if not row[0] in kaomojis:
+                     row[0] = row[0].replace("_", "-")
+                 row[3] = row[3].replace("_", "-")
+                 writer.writerow(row)
+             outfile.close()
+         csvfile.close()
+     os.remove(csv_filename)
+     csv_filename = csv_filename.removesuffix('-temp')
+
+ print(f'Data has been written to {csv_filename}', flush=True)