This repository was archived by the owner on Jul 13, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathjson_from_twitter.py
94 lines (70 loc) · 3.01 KB
/
json_from_twitter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env python3
# encoding: utf-8
# This script saves tweets in JSON instead of avro. Each line of the output file is a JSON-encoded tweet with metadata
import datetime
import json
import tweepy
import argparse
import os
import sys
import signal
import configparser
# Load the Twitter API credentials from the [twitter] section of config.ini
# (the path is relative to the current working directory).
config = configparser.ConfigParser()
config.read('config.ini')

# ConfigParser.read() silently skips files it cannot open, so a missing file,
# section or option surfaces as a KeyError on the lookups below.
consumer_key, consumer_secret, access_token, access_token_secret = (
    config['twitter'][option]
    for option in (
        'consumer_key',
        'consumer_secret',
        'access_token',
        'access_token_secret',
    )
)

# Output file handle; it is assigned after the command-line arguments have
# been parsed further down in the script.
writer = None
class MyStreamListener(tweepy.StreamListener):
    """Stream listener that appends every received tweet to the output file.

    Each status is logged to stdout and written to the module-level
    ``writer`` file object as one JSON document per line.
    """

    def __init__(self):
        super().__init__()
        # Number of statuses received so far; only used in the console log.
        self.counter = 0

    def on_status(self, status):
        """Log one status and append its JSON record to ``writer``.

        For a retweet, the text of the retweeted status is stored instead of
        the retweet's own text. The record carries the tweet text, a source
        URL, a list of tags (hashtags plus ``loc:``/``lang:``/``src:``
        entries) and the creation timestamp in ISO format.
        """
        self.counter += 1

        text = status.text
        if hasattr(status, 'retweeted_status'):
            text = status.retweeted_status.text
        author = status.author.screen_name
        print('{4} [{3}] {0} {1}: {2}'.format(status.id, author, text, self.counter, datetime.datetime.now().isoformat()))

        tags = [hashtag['text'] for hashtag in status.entities['hashtags']]
        if status.user.location is not None:
            # Commas are stripped from the location before it is stored as a tag.
            tags.append('loc:' + status.user.location.replace(',', ''))
        # The language (and possibly the source) can be null or absent on a
        # status; skip the tag instead of failing on str + None.
        lang = getattr(status, 'lang', None)
        if lang is not None:
            tags.append('lang:' + lang)
        source = getattr(status, 'source', None)
        if source is not None:
            tags.append('src:' + source)

        record = {
            "text": text,
            # Fixed: the original template produced ".../status//<id>".
            "source": 'twitter.com/{1}/status/{0}'.format(status.id, author),
            "tags": tags,
            "timestamp": status.created_at.isoformat(),
        }
        # Write the record and its line terminator in a single call so a
        # line is never left without its newline.
        writer.write(json.dumps(record) + '\n')
# Command-line interface: optional language and keyword filters, plus the
# path of the output file.
parser = argparse.ArgumentParser(
    description="store Tweets from a stream. it runs continuously until stopped",
)
parser.add_argument(
    "-languages",
    type=str,
    default=None,
    help="the languages of the tweets, comma delimited",
)
parser.add_argument(
    "-track",
    type=str,
    default=None,
    help="the keywords to search",
)
parser.add_argument(
    "-output_file",
    type=str,
    default="twitter_utterances.json",
    help="the one-JSON-per-line serialized file",
)
args = parser.parse_args()

# Opened in append mode, so an existing output file is extended rather than
# overwritten.
writer = open(args.output_file, "a+")

# Authenticate and build the stream around the listener.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)
myStreamListener = MyStreamListener()
myStream = tweepy.Stream(auth=api.auth, listener=myStreamListener)

# Both filters arrive as comma-delimited strings and are split into lists;
# None means the corresponding filter was not requested.
track = args.track
if track is None:
    print('will NOT filter on tweet content, no keywords were specified')
else:
    track = track.split(',')
    print('will filter using the search keys: ' + ','.join(track))

languages = args.languages
if languages is None:
    print('will NOT filter on tweet language, no languages were specified')
else:
    languages = languages.split(',')
    print('will filter the languages: ' + ','.join(languages))
def signal_handler(signal, frame):
    """Handle SIGINT: announce the shutdown, close the output file, exit with status 0.

    Neither argument is used. Note that the ``signal`` parameter shadows the
    ``signal`` module inside this function.
    """
    print('\nSIGINT received, closing file and exiting...')
    writer.close()
    # Equivalent to sys.exit(0), which raises SystemExit itself.
    raise SystemExit(0)
# Register the SIGINT handler before the stream starts.
signal.signal(signal.SIGINT, signal_handler)

# Start streaming. `async` is a reserved keyword from Python 3.7 on, so
# passing `async=True` is a SyntaxError there; tweepy 3.7.0 renamed the
# parameter to `is_async` (this call therefore requires tweepy >= 3.7).
myStream.filter(track=track, is_async=True, languages=languages)