I have the following script, which collects all tweets that contain the search terms $BTC or $ETH:
import sys
import time
import json
import pandas as pd
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy.streaming import StreamListener
# Twitter API credentials handed to OAuthHandler below; left blank here.
USER_KEY = ""
USER_SECRET = ""
ACCESS_TOKEN = ""
ACCESS_SECRET = ""
class StdOutListener(StreamListener):
    """Stream listener that prints every decoded message to stdout."""

    def on_data(self, data):
        """Decode one raw JSON payload from the stream and print it.

        Args:
            data: JSON text of a single stream message.
        """
        tweet = json.loads(data)
        print(tweet)

    def on_error(self, status):
        """Print the error status reported by the stream.

        Args:
            status: Error status passed in by tweepy.

        Returns:
            Always False.
        """
        print(status)
        # NOTE(review): tweepy 3.x is documented to disconnect the stream when
        # on_error returns False -- confirm against the installed version.
        return False
if __name__ == "__main__":
    # Authenticate with the module-level credentials and start streaming
    # English tweets that mention the tracked cashtags.
    listener = StdOutListener()
    auth = OAuthHandler(USER_KEY, USER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
    # The listener instance is named `listener`; the original passed an
    # undefined name `l`.
    stream = Stream(auth, listener)
    # `async` is a reserved word since Python 3.7, so `async=True` no longer
    # parses; tweepy >= 3.7.0 spells this keyword `is_async`.
    stream.filter(languages=['en'], track=['$BTC', '$ETH'], is_async=True)
The problem is that there is a lot of spam within the output. For example:
Ideally, I would like to filter out the spam, so I checked the JSON output of this type of tweet:
{
'created_at': 'Fri Oct 05 07:09:19 +0000 2018',
'id': 1048107851829452800,
'id_str': '1048107851829452800',
'text': 'Current $ARK price: $0.69 \n\nWe checked! Binance registration is
currently open ???? ???? \n\n➡️ url_that_is_forbidden_by_stack_overflow…
url_that_is_forbidden_by_stack_overflow',
'display_text_range': [
0,
140
],
'source': '<a href="http://www.google.com"
rel="nofollow">medicinetoletitwin</a>',
'truncated': True,
'in_reply_to_status_id': None,
'in_reply_to_status_id_str': None,
'in_reply_to_user_id': None,
'in_reply_to_user_id_str': None,
'in_reply_to_screen_name': None,
'user': {
'id': 937395812748820480,
'id_str': '937395812748820480',
'name': 'Brenna ????',
'screen_name': 'BrennaPham',
'location': 'San Jose, CA',
'url': url_that_is_forbidden_by_stack_overflow,
'description': 'Gamer ~',
'translator_type': 'none',
'protected': False,
'verified': False,
'followers_count': 168,
'friends_count': 22,
'listed_count': 2,
'favourites_count': 17,
'statuses_count': 12620,
'created_at': 'Sun Dec 03 18:59:12 +0000 2017',
'utc_offset': None,
'time_zone': None,
'geo_enabled': False,
'lang': 'en',
'contributors_enabled': False,
'is_translator': False,
'profile_background_color': 'F5F8FA',
'profile_background_image_url': '',
'profile_background_image_url_https': '',
'profile_background_tile': False,
'profile_link_color': '1DA1F2',
'profile_sidebar_border_color': 'C0DEED',
'profile_sidebar_fill_color': 'DDEEF6',
'profile_text_color': '333333',
'profile_use_background_image': True,
'profile_image_url':
'http://pbs.twimg.com/profile_images/939260582460506112/oltTD-
f1_normal.jpg',
'profile_image_url_https':
'https://pbs.twimg.com/profile_images/939260582460506112/oltTD-
f1_normal.jpg',
'default_profile': True,
'default_profile_image': False,
'following': None,
'follow_request_sent': None,
'notifications': None
},
'geo': None,
'coordinates': None,
'place': None,
'contributors': None,
'is_quote_status': False,
'extended_tweet': {
'full_text': 'Current $ARK price: $0.69 \n\nWe checked! Binance
registration is currently open ???? ???? \n\n➡️
url_that_is_forbidden_by_stack_overflow\n\n$EOS $BCAP $RMC $TIME $RCN $XVC $PBL $BTC $WGR
$FLIXX $BTG $BCD $SMT $UKG $XWC $XEM $CRB $PASC $KMD $VIU $BNB $YOYOW
$INFX url_that_is_forbidden_by_stack_overflow',
'display_text_range': [
0,
236
],
'entities': {
'hashtags': [],
'urls': [
{
'url': 'url_that_is_forbidden_by_stack_overflow',
'expanded_url': 'http://binance.com/?ref=10078236',
'display_url': 'binance.com/?ref=10078236',
'indices': [
89,
112
]
}
],
'user_mentions': [],
'symbols': [
{
'text': 'ARK',
'indices': [
8,
12
]
},
{
'text': 'EOS',
'indices': [
114,
118
]
},
{
'text': 'BCAP',
'indices': [
119,
124
]
},
{
'text': 'RMC',
'indices': [
125,
129
]
},
{
'text': 'TIME',
'indices': [
130,
135
]
},
{
'text': 'RCN',
'indices': [
136,
140
]
},
{
'text': 'XVC',
'indices': [
141,
145
]
},
{
'text': 'PBL',
'indices': [
146,
150
]
},
{
'text': 'BTC',
'indices': [
151,
155
]
},
{
'text': 'WGR',
'indices': [
156,
160
]
},
{
'text': 'FLIXX',
'indices': [
161,
167
]
},
{
'text': 'BTG',
'indices': [
168,
172
]
},
{
'text': 'BCD',
'indices': [
173,
177
]
},
{
'text': 'SMT',
'indices': [
178,
182
]
},
{
'text': 'UKG',
'indices': [
183,
187
]
},
{
'text': 'XWC',
'indices': [
188,
192
]
},
{
'text': 'XEM',
'indices': [
193,
197
]
},
{
'text': 'CRB',
'indices': [
198,
202
]
},
{
'text': 'PASC',
'indices': [
203,
208
]
},
{
'text': 'KMD',
'indices': [
209,
213
]
},
{
'text': 'VIU',
'indices': [
214,
218
]
},
{
'text': 'BNB',
'indices': [
219,
223
]
},
{
'text': 'YOYOW',
'indices': [
224,
230
]
},
{
'text': 'INFX',
'indices': [
231,
236
]
}
],
'media': [
{
'id': 1048107850399137792,
'id_str': '1048107850399137792',
'indices': [
237,
260
],
'media_url': 'http://pbs.twimg.com/media/DougiW1WwAA_8PZ.jpg',
'media_url_https': 'https://pbs.twimg.com/media/DougiW1WwAA_8PZ.jpg',
'url': 'url_that_is_forbidden_by_stack_overflow',
'display_url': 'pic.twitter.com/2Y1evrvz3x',
'expanded_url': 'https://twitter.com/BrennaPham/status/1048107851829452800/photo/1',
'type': 'photo',
'sizes': {
'thumb': {
'w': 150,
'h': 150,
'resize': 'crop'
},
'large': {
'w': 1920,
'h': 1080,
'resize': 'fit'
},
'medium': {
'w': 1200,
'h': 675,
'resize': 'fit'
},
'small': {
'w': 680,
'h': 383,
'resize': 'fit'
}
}
}
]
},
'extended_entities': {
'media': [
{
'id': 1048107850399137792,
'id_str': '1048107850399137792',
'indices': [
237,
260
],
'media_url': 'http://pbs.twimg.com/media/DougiW1WwAA_8PZ.jpg',
'media_url_https': 'https://pbs.twimg.com/media/DougiW1WwAA_8PZ.jpg',
'url': url_that_is_forbidden_by_stack_overflow,
'display_url': 'pic.twitter.com/2Y1evrvz3x',
'expanded_url': 'https://twitter.com/BrennaPham/status/1048107851829452800/photo/1',
'type': 'photo',
'sizes': {
'thumb': {
'w': 150,
'h': 150,
'resize': 'crop'
},
'large': {
'w': 1920,
'h': 1080,
'resize': 'fit'
},
'medium': {
'w': 1200,
'h': 675,
'resize': 'fit'
},
'small': {
'w': 680,
'h': 383,
'resize': 'fit'
}
}
}
]
}
},
'quote_count': 0,
'reply_count': 0,
'retweet_count': 0,
'favorite_count': 0,
'entities': {
'hashtags': [],
'urls': [
{
'url': 'url_that_is_forbidden_by_stack_overflow',
'expanded_url': 'http://binance.com/?ref=10078236',
'display_url': 'binance.com/?ref=10078236',
'indices': [
89,
112
]
},
{
'url': 'url_that_is_forbidden_by_stack_overflow',
'expanded_url': 'https://twitter.com/i/web/status/1048107851829452800',
'display_url': 'twitter.com/i/web/status/1…',
'indices': [
114,
137
]
}
],
'user_mentions': [],
'symbols': [
{
'text': 'ARK',
'indices': [
8,
12
]
}
]
},
'favorited': False,
'retweeted': False,
'possibly_sensitive': False,
'filter_level': 'low',
'lang': 'en',
'timestamp_ms': '1538723359435'
}
I identified the following criteria to classify a tweet as spam:
- tweet['user']['friends_count'] < 50
- tweet['user']['followers_count'] > 50
- any tweet['entities']['urls'][i]['display_url'] contains a link to one of a set of specific spam websites
As a result, I wrote a new function in the script:
import sys
import time
import json
import pandas as pd
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy.streaming import StreamListener
# Twitter API credentials handed to OAuthHandler below; left blank here.
USER_KEY = ''
USER_SECRET = ''
ACCESS_TOKEN = ''
ACCESS_SECRET = ''
# Substrings matched against a tweet's link display URLs to recognise spam.
# The second entry was missing its opening quote, which is a syntax error.
spam = ['spam_website_1', 'spam_website_2', 'spam_website_3']
class StdOutListener(StreamListener):
    """Stream listener that prints incoming tweets unless they look like spam."""

    def on_data(self, data):
        """Decode one raw JSON payload from the stream and run it through the filter.

        Args:
            data: JSON text of a single stream message.
        """
        tweet = json.loads(data)
        self.filter_tweet(tweet)

    def filter_tweet(self, tweet):
        """Print ``tweet`` unless it matches every spam criterion.

        A tweet is dropped only when all of the following hold:
        the author has fewer than 50 friends, the author has fewer than 50
        followers, and at least one link's ``display_url`` contains an entry
        of the module-level ``spam`` list. Everything else is printed.

        Args:
            tweet: Decoded stream message (a dict).
        """
        # NOTE(review): the accompanying description lists
        # followers_count > 50 as the criterion, while the code tested < 50.
        # The coded threshold is kept here -- confirm which one is intended.
        user = tweet.get('user') or {}
        # 'urls' is a list of dicts (one per link), so collect every
        # display_url instead of indexing the list with a string key.
        links = (tweet.get('entities') or {}).get('urls') or []
        display_urls = [link.get('display_url') or '' for link in links]

        # Cheap integer comparisons come first so the substring scan only
        # runs for accounts that already look suspicious.
        looks_like_spam = (
            user.get('friends_count', 0) < 50
            and user.get('followers_count', 0) < 50
            and any(site in url for url in display_urls for site in spam)
        )
        if not looks_like_spam:
            print(tweet)

    def on_error(self, status):
        """Print the error status reported by the stream.

        Args:
            status: Error status passed in by tweepy.

        Returns:
            Always False.
        """
        print(status)
        # NOTE(review): tweepy 3.x is documented to disconnect the stream when
        # on_error returns False -- confirm against the installed version.
        return False
if __name__ == "__main__":
    # Authenticate with the module-level credentials and start streaming
    # English tweets that mention the tracked cashtags.
    listener = StdOutListener()
    auth = OAuthHandler(USER_KEY, USER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
    # The listener instance is named `listener`; the original passed an
    # undefined name `l`.
    stream = Stream(auth, listener)
    # `async` is a reserved word since Python 3.7, so `async=True` no longer
    # parses; tweepy >= 3.7.0 spells this keyword `is_async`.
    stream.filter(languages=['en'], track=['$BTC', '$ETH'], is_async=True)
I would like to know if this part is fast enough for the Streaming API output or should be rewritten. Is it possible to write this more elegantly and faster?
# One short-circuiting condition replaces the nested ifs: evaluation stops at
# the first false operand, so the substring scan runs only when the cheaper
# checks pass. Testing `url` for truthiness (instead of len(url) != 0) also
# covers the None assigned when no display_url is present, and removes the
# unbalanced ')' that made the original line a syntax error.
if (
    url
    and data['user']['friends_count'] < 50
    and data['user']['followers_count'] < 50
    and any(x in url for x in spam)
):
    pass