-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawlbot.py
125 lines (100 loc) · 4.28 KB
/
crawlbot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
from __future__ import division
import argparse
from subprocess import STDOUT, call, TimeoutExpired
import os
# import pymongo
import datetime
import json
import csv
# Interpreter used to launch the crawl scripts as subprocesses.
GO_CRAWL_CMD = "python3"

# The 'in' and 'fb' crawl scripts are looked up in the directory that
# contains this file, regardless of the current working directory.
GO_CRAWL_IN_PATH = os.path.dirname(os.path.abspath(__file__)) + "/gocrawl_in.py"
GO_CRAWL_FB_PATH = os.path.dirname(os.path.abspath(__file__)) + "/gocrawl_fb.py"
def csv_len(fname):
    """Return the number of CSV records stored in the file *fname*.

    Every record is counted, including a header line if the file has one.
    A quoted field that spans several physical lines still counts as a
    single record because the file is parsed with ``csv.reader``.

    Args:
        fname: Path of the CSV file to read.

    Returns:
        The record count as an ``int`` (``0`` for an empty file).

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # newline='' lets the csv module handle line endings itself, as its
    # documentation requires for files passed to csv.reader.
    with open(fname, newline='') as f:
        return sum(1 for _ in csv.reader(f))
def main():
    """Run the crawl script in batches until today's CSV holds enough rows.

    Steps:
      1. Parse the command-line options.
      2. Make sure the ``img`` directory next to this script and the output
         directory (``--dir_prefix``) exist.
      3. Create today's ``<sns_kind>-explore-<YYYY-MM-DD>.csv`` with its
         header line if it does not exist yet, otherwise count its rows.
      4. Launch the 'in' or 'fb' crawl script as a subprocess again and
         again until the CSV row count reaches the start-up row count plus
         ``--number``.

    The loop gives up (with a warning on stderr) when several consecutive
    runs add no rows, so a permanently failing crawl script cannot keep this
    process spinning forever.

    Raises:
        OSError: If the settings file, the CSV file or the crawl interpreter
            cannot be opened or started.
        ValueError: If the settings file is not valid JSON.
    """
    import sys  # local import: only needed for the stall warning below

    batch_size = 500       # posts requested from the crawl script per run
    max_stalled_runs = 3   # consecutive runs without new rows before giving up

    # Arguments #
    parser = argparse.ArgumentParser(description='Pengtai Instagram Crawler')
    parser.add_argument('-d', '--dir_prefix', type=str,
                        default='./data', help='directory to save results')
    parser.add_argument('-q', '--query', type=str,
                        help="target to crawl, add '#' for hashtags")
    parser.add_argument('-t', '--crawl_type', type=str,
                        default='all', help="Options: 'all' | 'tags' | 'photos' | 'following'")
    parser.add_argument('-n', '--number', type=int, default=0,
                        help='Number of posts to download: integer')
    parser.add_argument('-k', '--sns_kind', type=str,
                        default='in', help="Options: 'in' | 'fb'")
    parser.add_argument('-l', '--headless', action='store_true',
                        help='If set, will use PhantomJS driver to run script as headless')
    parser.add_argument('-a', '--authentication', type=str, default='auth.json',
                        help='path to authentication json file')
    parser.add_argument('-s', '--setting', type=str, default='settings.json',
                        help='path to setting json file')
    parser.add_argument('-e', '--env', type=str, default='pro',
                        help="environment options: 'pro' | 'dev' | 'test'")
    parser.add_argument('-r', '--random', action='store_true',
                        help='enables tags mode with random hashtags @ setting.json')
    args = parser.parse_args()
    # End Argparse #

    now = datetime.datetime.now()
    script_dir = "{}/".format(os.path.dirname(os.path.abspath(__file__)))
    crawl_script = GO_CRAWL_FB_PATH if args.sns_kind == 'fb' else GO_CRAWL_IN_PATH

    # img directory check
    os.makedirs(os.path.join(script_dir, 'img'), exist_ok=True)

    # The settings are not used here any more (row counting moved from
    # MongoDB to the CSV file), but the file is still parsed so that a
    # missing or malformed settings file fails before any crawl starts.
    with open(args.setting) as data_file:
        json.load(data_file)

    # The output directory must exist before the CSV header can be written.
    if args.dir_prefix:
        os.makedirs(args.dir_prefix, exist_ok=True)

    # Daily post count check, based on the CSV file of the current day.
    csv_filename = "{}-explore-{}".format(args.sns_kind, now.strftime("%Y-%m-%d"))
    csv_file_loc = "{}/{}.csv".format(args.dir_prefix, csv_filename)
    initial_rows = 0
    if os.path.exists(csv_file_loc):
        initial_rows = csv_len(csv_file_loc)
    else:
        with open(csv_file_loc, 'w') as csv_file:
            csv_file.write("id,img,text,has_tag,write_date,reg_date\n")

    # NOTE(review): row counts include the header line, and a freshly created
    # file starts from 0 even though the header already occupies one row, so
    # the target may be reached one data row early -- confirm this is intended.
    target_rows = initial_rows + args.number
    current_rows = 0

    # The command line is identical for every run, so build it once.
    cmd_arr = [GO_CRAWL_CMD, crawl_script,
               '-d=' + csv_file_loc,
               '-t=' + args.crawl_type,
               '-n=' + str(batch_size),
               '-a=' + args.authentication,
               '-s=' + args.setting,
               '-e=' + args.env]
    if args.query:
        cmd_arr.append('-q={}'.format(args.query))
    elif args.random:
        cmd_arr.append('-r')
    if args.headless:
        cmd_arr.append('-l')

    rows_before_run = csv_len(csv_file_loc)
    stalled_runs = 0
    while target_rows > current_rows:
        return_code = call(cmd_arr)
        current_rows = csv_len(csv_file_loc)

        if current_rows > rows_before_run:
            stalled_runs = 0
        else:
            # No new rows: the crawl script failed or found nothing. Retry a
            # few times, then stop instead of looping forever.
            stalled_runs += 1
            if stalled_runs >= max_stalled_runs:
                print("crawlbot: {} consecutive runs added no rows to {} "
                      "(last exit code {}); giving up".format(
                          stalled_runs, csv_file_loc, return_code),
                      file=sys.stderr)
                break
        rows_before_run = current_rows
# Start the crawler only when this file is executed as a script, so that
# importing the module does not trigger a crawl.
if __name__ == "__main__":
    main()