0
votes

I have the following script that collects all tweets containing the search terms $BTC or $ETH:

import sys
import time
import json
import pandas as pd
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy.streaming import StreamListener

# Twitter API credentials, left blank here. They are handed to
# OAuthHandler(...) and auth.set_access_token(...) in the __main__ block.
USER_KEY = ''
USER_SECRET = ''
ACCESS_TOKEN = ''
ACCESS_SECRET = ''

class StdOutListener(StreamListener):
    """Stream listener that prints every received message to stdout."""

    def on_data(self, data):
        """Decode one raw JSON message from the stream and print it.

        Args:
            data: Raw JSON string delivered by the stream.
        """
        tweet = json.loads(data)
        print(tweet)

    def on_error(self, status):
        """Print the error status reported by the stream.

        Args:
            status: Error status passed in by tweepy.

        Returns:
            Always False; in tweepy 3.x a False return from ``on_error``
            disconnects the stream instead of retrying.
        """
        print(status)
        return False

if __name__ == "__main__":
    # Authenticate, then stream English tweets that mention $BTC or $ETH.
    listener = StdOutListener()
    auth = OAuthHandler(USER_KEY, USER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
    # Pass the listener created above (the original referenced an undefined `l`).
    stream = Stream(auth, listener)
    # `async` is a reserved keyword since Python 3.7; tweepy >= 3.7 names the
    # argument `is_async`.
    stream.filter(languages=['en'], track=['$BTC', '$ETH'], is_async=True)

The problem is that there is a lot of spam within the output. For example:

enter image description here

Ideally, I would like to filter out the spam, so I checked the JSON output of this type of tweet:

{
 'created_at': 'Fri Oct 05 07:09:19 +0000 2018',
 'id': 1048107851829452800,
 'id_str': '1048107851829452800',
 'text': 'Current $ARK price: $0.69 \n\nWe checked! Binance registration is 
  currently open ???? ????  \n\n➡️ url_that_is_forbidden_by_stack_overflow… 
  url_that_is_forbidden_by_stack_overflow',
 'display_text_range': [
   0,
   140
  ],
 'source': '<a href="http://www.google.com" 
  rel="nofollow">medicinetoletitwin</a>',
 'truncated': True,
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'in_reply_to_screen_name': None,
 'user': {
  'id': 937395812748820480,
  'id_str': '937395812748820480',
  'name': 'Brenna ????',
  'screen_name': 'BrennaPham',
  'location': 'San Jose, CA',
  'url': url_that_is_forbidden_by_stack_overflow,
  'description': 'Gamer ~',
  'translator_type': 'none',
  'protected': False,
  'verified': False,
  'followers_count': 168,
  'friends_count': 22,
  'listed_count': 2,
  'favourites_count': 17,
  'statuses_count': 12620,
  'created_at': 'Sun Dec 03 18:59:12 +0000 2017',
  'utc_offset': None,
  'time_zone': None,
  'geo_enabled': False,
  'lang': 'en',
  'contributors_enabled': False,
  'is_translator': False,
  'profile_background_color': 'F5F8FA',
  'profile_background_image_url': '',
  'profile_background_image_url_https': '',
  'profile_background_tile': False,
  'profile_link_color': '1DA1F2',
  'profile_sidebar_border_color': 'C0DEED',
  'profile_sidebar_fill_color': 'DDEEF6',
  'profile_text_color': '333333',
  'profile_use_background_image': True,
  'profile_image_url': 
  'http://pbs.twimg.com/profile_images/939260582460506112/oltTD- 
   f1_normal.jpg',
  'profile_image_url_https': 
  'https://pbs.twimg.com/profile_images/939260582460506112/oltTD- 
   f1_normal.jpg',
  'default_profile': True,
  'default_profile_image': False,
  'following': None,
  'follow_request_sent': None,
  'notifications': None
   },
 'geo': None,
 'coordinates': None,
 'place': None,
 'contributors': None,
 'is_quote_status': False,
 'extended_tweet': {
  'full_text': 'Current $ARK price: $0.69 \n\nWe checked! Binance 
   registration is currently open ???? ????  \n\n➡️ 
   url_that_is_forbidden_by_stack_overflow\n\n$EOS $BCAP $RMC $TIME $RCN $XVC $PBL $BTC $WGR 
   $FLIXX $BTG $BCD $SMT $UKG $XWC $XEM $CRB $PASC $KMD $VIU $BNB $YOYOW 
   $INFX url_that_is_forbidden_by_stack_overflow',
  'display_text_range': [
   0,
   236
   ],
 'entities': {
  'hashtags': [],
  'urls': [
    {
      'url': 'url_that_is_forbidden_by_stack_overflow',
      'expanded_url': 'http://binance.com/?ref=10078236',
      'display_url': 'binance.com/?ref=10078236',
      'indices': [
        89,
        112
      ]
    }
  ],
  'user_mentions': [],
  'symbols': [
    {
      'text': 'ARK',
      'indices': [
        8,
        12
      ]
    },
    {
      'text': 'EOS',
      'indices': [
        114,
        118
      ]
    },
    {
      'text': 'BCAP',
      'indices': [
        119,
        124
      ]
    },
    {
      'text': 'RMC',
      'indices': [
        125,
        129
      ]
    },
    {
      'text': 'TIME',
      'indices': [
        130,
        135
      ]
    },
    {
      'text': 'RCN',
      'indices': [
        136,
        140
      ]
    },
    {
      'text': 'XVC',
      'indices': [
        141,
        145
      ]
    },
    {
      'text': 'PBL',
      'indices': [
        146,
        150
      ]
    },
    {
      'text': 'BTC',
      'indices': [
        151,
        155
      ]
    },
    {
      'text': 'WGR',
      'indices': [
        156,
        160
      ]
    },
    {
      'text': 'FLIXX',
      'indices': [
        161,
        167
      ]
    },
    {
      'text': 'BTG',
      'indices': [
        168,
        172
      ]
    },
    {
      'text': 'BCD',
      'indices': [
        173,
        177
      ]
    },
    {
      'text': 'SMT',
      'indices': [
        178,
        182
      ]
    },
    {
      'text': 'UKG',
      'indices': [
        183,
        187
      ]
    },
    {
      'text': 'XWC',
      'indices': [
        188,
        192
      ]
    },
    {
      'text': 'XEM',
      'indices': [
        193,
        197
      ]
    },
    {
      'text': 'CRB',
      'indices': [
        198,
        202
      ]
    },
    {
      'text': 'PASC',
      'indices': [
        203,
        208
      ]
    },
    {
      'text': 'KMD',
      'indices': [
        209,
        213
      ]
    },
    {
      'text': 'VIU',
      'indices': [
        214,
        218
      ]
    },
    {
      'text': 'BNB',
      'indices': [
        219,
        223
      ]
    },
    {
      'text': 'YOYOW',
      'indices': [
        224,
        230
      ]
    },
    {
      'text': 'INFX',
      'indices': [
        231,
        236
      ]
    }
  ],
  'media': [
    {
      'id': 1048107850399137792,
      'id_str': '1048107850399137792',
      'indices': [
        237,
        260
      ],
      'media_url': 'http://pbs.twimg.com/media/DougiW1WwAA_8PZ.jpg',
      'media_url_https': 'https://pbs.twimg.com/media/DougiW1WwAA_8PZ.jpg',
      'url': 'url_that_is_forbidden_by_stack_overflow',
      'display_url': 'pic.twitter.com/2Y1evrvz3x',
      'expanded_url': 'https://twitter.com/BrennaPham/status/1048107851829452800/photo/1',
      'type': 'photo',
      'sizes': {
        'thumb': {
          'w': 150,
          'h': 150,
          'resize': 'crop'
        },
        'large': {
          'w': 1920,
          'h': 1080,
          'resize': 'fit'
        },
        'medium': {
          'w': 1200,
          'h': 675,
          'resize': 'fit'
        },
        'small': {
          'w': 680,
          'h': 383,
          'resize': 'fit'
        }
      }
    }
  ]
},
'extended_entities': {
  'media': [
    {
      'id': 1048107850399137792,
      'id_str': '1048107850399137792',
      'indices': [
        237,
        260
      ],
      'media_url': 'http://pbs.twimg.com/media/DougiW1WwAA_8PZ.jpg',
      'media_url_https': 'https://pbs.twimg.com/media/DougiW1WwAA_8PZ.jpg',
      'url': url_that_is_forbidden_by_stack_overflow,
      'display_url': 'pic.twitter.com/2Y1evrvz3x',
      'expanded_url': 'https://twitter.com/BrennaPham/status/1048107851829452800/photo/1',
      'type': 'photo',
      'sizes': {
        'thumb': {
          'w': 150,
          'h': 150,
          'resize': 'crop'
        },
        'large': {
          'w': 1920,
          'h': 1080,
          'resize': 'fit'
        },
        'medium': {
          'w': 1200,
          'h': 675,
          'resize': 'fit'
        },
        'small': {
          'w': 680,
          'h': 383,
          'resize': 'fit'
        }
      }
    }
  ]
}
 },
'quote_count': 0,
'reply_count': 0,
'retweet_count': 0,
'favorite_count': 0,
'entities': {
 'hashtags': [],
 'urls': [
  {
    'url': 'url_that_is_forbidden_by_stack_overflow',
    'expanded_url': 'http://binance.com/?ref=10078236',
    'display_url': 'binance.com/?ref=10078236',
    'indices': [
      89,
      112
    ]
  },
  {
    'url': 'url_that_is_forbidden_by_stack_overflow',
    'expanded_url': 'https://twitter.com/i/web/status/1048107851829452800',
    'display_url': 'twitter.com/i/web/status/1…',
    'indices': [
      114,
      137
    ]
  }
 ],
 'user_mentions': [],
 'symbols': [
  {
    'text': 'ARK',
    'indices': [
      8,
      12
    ]
  }
 ]
},
'favorited': False,
'retweeted': False,
'possibly_sensitive': False,
'filter_level': 'low',
'lang': 'en',
'timestamp_ms': '1538723359435'

I identified the following criteria to classify a tweet as spam:

  • tweet['friends_count'] < 50
  • tweet['followers_count'] > 50
  • tweet['entities']['urls'][i]['display_url'] contains a link to one of a set of known spam websites

As a result, I wrote a new function in the script:

import sys
import time
import json
import pandas as pd
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy.streaming import StreamListener

# Twitter API credentials, left blank here. They are handed to
# OAuthHandler(...) and auth.set_access_token(...) in the __main__ block.
USER_KEY = ''
USER_SECRET = ''
ACCESS_TOKEN = ''
ACCESS_SECRET = ''
# Substrings identifying spam websites; the listener looks for them in the
# display_url of a tweet's links.
spam = ['spam_website_1', 'spam_website_2', 'spam_website_3']

class StdOutListener(StreamListener):
    """Stream listener that prints tweets unless they match the spam rules.

    A tweet is treated as spam when the author has fewer than
    ``MAX_FRIENDS`` friends and fewer than ``MAX_FOLLOWERS`` followers and
    at least one of its links points at a site listed in the module-level
    ``spam`` list.
    """

    # NOTE(review): the written criteria say followers_count > 50 while the
    # original code compared with < 50; the code's comparison is kept here.
    # Confirm which one is intended.
    MAX_FRIENDS = 50
    MAX_FOLLOWERS = 50

    def on_data(self, data):
        """Decode one raw JSON message from the stream and filter it.

        Args:
            data: Raw JSON string delivered by the stream.
        """
        tweet = json.loads(data)
        self.filter_tweet(tweet)

    @staticmethod
    def _display_urls(tweet):
        """Return the ``display_url`` of every link entity in *tweet*.

        ``entities['urls']`` is a list of dicts, one per link. Truncated
        tweets also carry a second entity set under ``extended_tweet``,
        so both are inspected.
        """
        extended = tweet.get('extended_tweet') or {}
        found = []
        for entities in (tweet.get('entities'), extended.get('entities')):
            for link in (entities or {}).get('urls') or []:
                display_url = link.get('display_url')
                if display_url:
                    found.append(display_url)
        return found

    def is_spam(self, tweet):
        """Return True when *tweet* satisfies every spam criterion."""
        urls = self._display_urls(tweet)
        if not urls:
            # Without a link there is nothing to match against the spam list.
            return False
        user = tweet.get('user') or {}
        if user.get('friends_count', 0) >= self.MAX_FRIENDS:
            return False
        if user.get('followers_count', 0) >= self.MAX_FOLLOWERS:
            return False
        return any(site in url for url in urls for site in spam)

    def filter_tweet(self, tweet):
        """Print *tweet* unless it is classified as spam.

        Args:
            tweet: Decoded message dict from the stream.
        """
        # The original attached `else` to the outer URL check, which dropped
        # every non-spam tweet that contained a link.
        if not self.is_spam(tweet):
            print(tweet)

    def on_error(self, status):
        """Print the error status reported by the stream.

        Returns:
            Always False; in tweepy 3.x a False return from ``on_error``
            disconnects the stream instead of retrying.
        """
        print(status)
        return False

if __name__ == "__main__":
    # Authenticate, then stream English tweets that mention $BTC or $ETH.
    listener = StdOutListener()
    auth = OAuthHandler(USER_KEY, USER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
    # Pass the listener created above (the original referenced an undefined `l`).
    stream = Stream(auth, listener)
    # `async` is a reserved keyword since Python 3.7; tweepy >= 3.7 names the
    # argument `is_async`.
    stream.filter(languages=['en'], track=['$BTC', '$ETH'], is_async=True)

I would like to know whether this part is fast enough for the Streaming API output or whether it should be rewritten. Is it possible to write it more elegantly and make it faster?

 # Fragment from filter_tweet: `url` may be None, so test truthiness
 # instead of calling len(); the decoded message is named `tweet`.
 if url:
     if tweet['user']['friends_count'] < 50:
         if tweet['user']['followers_count'] < 50:
             if any(x in url for x in spam):
                 pass
1

1 Answers

0
votes

As a side note, you wrote tweet['followers_count'] > 50 in your criteria, but data['user']['followers_count'] < 50 in the code.

Filtering spam is a hard job; I guess even Twitter can't solve it. I don't think this is a question of follower or friend counts. And even if you write a blacklist of scam and spam sites, it will change every day. Maybe you could also make a blacklist of users.

In my experience, `if` tests are very fast (I have some experience with chess programming), so yes, you can check while streaming. But if the number of tests grows, you could push the tweets directly, without any processing, into a store such as Redis. Another script (a worker) can then pull the tweets out of Redis and run the spam tests.

EDIT: another idea for a spam criterion is the number of tweets posted per day. The user you gave as an example is publishing nearly 42 tweets per day, so a test could be: if tweetsPerDay > 10, it is spam. (I got this number by taking the time between the account's created_at date and now, converting it to days, and dividing the total tweet count by it to get the average.) But it may not be very accurate, just like checking the follower and friend counts.