0
votes

I'm trying to run the code below to clean a set of tweets stored in a .txt file.

I passed the arguments on the command line as well, but nothing seems to be output.

Any idea what I may be doing wrong?

Here is the code below:

Code:

#!/usr/bin/python
# -*- coding: utf-8 -*-

import sys
import os
import re
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import pos_tag


def clean(path, filename):
    """Clean a file of tweets and write the unique, non-empty results.

    The output path is the module-level ``CLEANED_DATA`` string with the
    stripped ``filename`` appended. Input lines are read from a hard-coded
    location; ``path`` is only mentioned in the commented-out print below
    and is otherwise unused.

    For each input line the function removes ``. , " !`` characters, a
    leading ``RT`` plus whitespace, everything from ``http://``/``https://``
    onward, colons, and characters not in ``string.printable``; it then drops
    words that start with ``@`` or ``#`` or that are a substring of
    ``string.punctuation``. A cleaned line is written (followed by a blank
    line) only the first time it is seen and only if it is not empty.

    Neither file object is closed explicitly.

    Returns the output path.
    """

    # print("Cleaning "+path)

    # Output name = CLEANED_DATA prefix + input file name.
    filename = CLEANED_DATA + filename.strip()
    WRITE_HANDLER = open(filename, 'wb')
    # Cleaned lines already seen; used only for duplicate detection.
    tweets = dict()
    # The input location is fixed here rather than taken from `path`.
    for line in open('/Users/Mustafa/Desktop/nexalogy/project3.txt',
                 'rb'):
        line = re.sub(r'[.,"!]+', '', line, flags=re.MULTILINE)  # removes the characters . , " !
        line = re.sub(r'^RT[\s]+', '', line, flags=re.MULTILINE)  # removes a leading RT and the whitespace after it
        line = re.sub(r'https?:\/\/.*[\r\n]*', '', line,
                  flags=re.MULTILINE)  # removes a link and everything after it on the line
        line = re.sub(r'[:]+', '', line, flags=re.MULTILINE)  # removes colons
        # NOTE(review): line.split() below needs filter() to return a str,
        # which holds on Python 2 only - confirm the target version.
        line = filter(lambda x: x in string.printable, line)  # filter non-ascii characters

        new_line = ''
        for i in line.split():  # remove @words, #words and punctuation-only words
            if not i.startswith('@') and not i.startswith('#') and i \
                not in string.punctuation:
                new_line += i + ' '
        line = new_line

        # # Do sentence correction

        # Skip cleaned lines that have already been produced once.
        if new_line in tweets:
            continue
        else:
            tweets[new_line] = 1
        # Write only non-empty results; the literal below is two newlines.
        if len(new_line.strip()) > 0:
           WRITE_HANDLER.write(new_line + '''

''')
    return filename


# Command-line arguments: argv[1] is the folder to walk, argv[2] is the string
# that clean() prepends to each input file name to form the output path.
DATA_FOLDER = sys.argv[1]
CLEANED_DATA = sys.argv[2]
# NOTE(review): as posted, the trailing comment on the `for` line wraps onto a
# bare line of text, which Python rejects as a syntax error.
for (root, dirs, files) in os.walk(DATA_FOLDER):  # gets all the files from 
subfolders recrsively
    for name in files:
        absolute_path = os.path.join(root, name)
        # NOTE(review): as posted, the clean() call is not indented under the
        # `if`, which Python rejects with an IndentationError.
        if os.path.isfile(absolute_path) and name != '.DS_Store':
        filename = clean(absolute_path, name)

File : Project3.txt

{"created_at":"Tue Oct 04 17:16:30 +0000 2016","id":783355126945722368,"id_str":"783355126945722368","text":"RT @Jacquiecharles: USAID providing $400,000 in initial assistance for humanitarian partners (not GOH) to rapidly provide critical relief.\u2026","truncated":false,"entities":{"hashtags":[],"symbols":[],"user_mentions":[{"screen_name":"Jacquiecharles","name":"Jacqueline Charles","id":15360434,"id_str":"15360434","indices":[3,18]}],"urls":[]},"metadata":{"iso_language_code":"en","result_type":"recent"},"source":"Twitter for iPhone<\/a>","in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":635031678,"id_str":"635031678","name":"Tracie Hamilton","screen_name":"TracieHamilton8","location":"","description":"Leaning & depending on Him everyday","url":null,"entities":{"description":{"urls":[]}},"protected":false,"followers_count":1929,"friends_count":715,"listed_count":63,"created_at":"Fri Jul 13 23:39:46 +0000 
2012","favourites_count":27603,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":17433,"lang":"en","contributors_enabled":false,"is_translator":false,"is_translation_enabled":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/575645183288610817\/5vJNgPld_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/575645183288610817\/5vJNgPld_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"has_extended_profile":false,"default_profile":true,"default_profile_image":false,"following":false,"follow_request_sent":false,"notifications":false},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Tue Oct 04 01:27:02 +0000 2016","id":783116185726357504,"id_str":"783116185726357504"

========================================================================

2

2 Answers

3
votes

Here is a fully working version:

#!/usr/bin/python
# -*- coding: utf-8 -*-

import sys
import os
import re
import string

def clean(inputDir, outputFile, inputName='project3.json'):
    """Clean a file of tweets, one per line, and write the unique results.

    Reads ``inputName`` from the directory ``inputDir``. Each input line is
    processed in this order:

    1. the characters ``. , " !`` are removed;
    2. a leading ``RT`` followed by whitespace is removed;
    3. everything from ``http://`` / ``https://`` to the end of the line is
       removed;
    4. colons are removed;
    5. characters outside ``string.printable`` are removed;
    6. whitespace-separated words that start with ``@`` or ``#``, or that
       are a substring of ``string.punctuation``, are dropped.

    Cleaned lines that are empty, or that an earlier line already produced,
    are skipped. Every other cleaned line is written to ``outputFile``
    followed by a blank line.

    :param inputDir: directory that contains the input file; a trailing
        path separator is optional.
    :param outputFile: path of the file to create (overwritten if present).
    :param inputName: name of the input file inside ``inputDir``.
    :returns: ``outputFile``, unchanged.
    :raises IOError: if the input cannot be opened or the output cannot be
        created (``OSError`` on Python 3).
    """
    import io  # function-scope import: the surrounding script does not import io

    printable = set(string.printable)
    seen = set()
    # os.path.join works whether or not inputDir ends with a separator;
    # plain string concatenation only worked when it did.
    source = os.path.join(inputDir, inputName)

    # Text mode with an explicit encoding behaves the same on Python 2 and 3.
    # Undecodable bytes are ignored because step 5 drops every non-ASCII
    # character anyway. newline='\n' splits lines on '\n' only and disables
    # newline translation, matching the previous binary-mode behaviour.
    with io.open(source, 'r', encoding='utf-8', errors='ignore',
                 newline='\n') as reader:
        with io.open(outputFile, 'w', encoding='utf-8',
                     newline='\n') as writer:
            for line in reader:
                line = re.sub(r'[.,"!]+', '', line, flags=re.MULTILINE)
                line = re.sub(r'^RT[\s]+', '', line, flags=re.MULTILINE)
                line = re.sub(r'https?:\/\/.*[\r\n]*', '', line,
                              flags=re.MULTILINE)
                line = re.sub(r'[:]+', '', line, flags=re.MULTILINE)
                # ''.join(...) yields a string on both Python 2 and 3,
                # whereas filter() on a str returns an iterator on Python 3.
                line = ''.join(ch for ch in line if ch in printable)

                # Every kept word is followed by a single space, so the
                # cleaned line keeps its historical trailing space.
                new_line = ''.join(
                    word + ' ' for word in line.split()
                    if not word.startswith(('@', '#'))
                    and word not in string.punctuation)

                if new_line in seen:
                    continue
                seen.add(new_line)
                if new_line.strip():
                    # Separate records with a blank line; without a
                    # separator all tweets would run together on one line.
                    writer.write(new_line + '\n\n')
    return outputFile


# Script entry point: argv[1] is the input directory and argv[2] is the
# output file; both are handed straight to clean().
DATA_FOLDER, CLEANED_DATA = sys.argv[1], sys.argv[2]
clean(DATA_FOLDER, CLEANED_DATA)

You invoke it by:

python clean.py inputDirectory outputFileName
1
votes

There are a couple of problems with your code:

  1. You are hard-coding the input file in the 'for line in open' statement
  2. Your output file name is not going to be clean.txt. It is going to be clean.txt followed by the name of each input file (the code does `CLEANED_DATA + filename.strip()`), so one output file will be created for each file in your directory
  3. There was some strange indentation
  4. The JSON that you posted was all on one line, so it was stripped out by the statement that removed punctuation
  5. You are passing a filename, but trying to walk the os filesystem based on that file name. You should pass:

    python clean.py DIR_NAME CLEAN_FILE

After fixing the indentation, and prettifying the JSON, I get correct output:

def clean(path, filename):
    """Clean the tweets stored in ``path`` and write the unique results.

    The output path is the module-level ``CLEANED_DATA`` string with the
    stripped ``filename`` appended; it is printed before any work is done.

    Each input line is processed in this order:

    1. the characters ``. , " !`` are removed;
    2. a leading ``RT`` followed by whitespace is removed;
    3. everything from ``http://`` / ``https://`` to the end of the line is
       removed;
    4. colons are removed;
    5. characters outside ``string.printable`` are removed;
    6. whitespace-separated words that start with ``@`` or ``#``, or that
       are a substring of ``string.punctuation``, are dropped.

    Cleaned lines that are empty, or that an earlier line already produced,
    are skipped. Every other cleaned line is written to the output file
    followed by a blank line.

    :param path: path of the tweet file to read.
    :param filename: name appended to ``CLEANED_DATA`` to form the output
        path; surrounding whitespace is stripped.
    :returns: the output path.
    :raises IOError: if the input cannot be opened or the output cannot be
        created (``OSError`` on Python 3).
    """
    import io  # function-scope import: the surrounding script does not import io

    # print("Cleaning "+path)

    filename = CLEANED_DATA + filename.strip()
    # The parenthesised form prints the same text on Python 2 and Python 3.
    print(filename)

    printable = set(string.printable)
    seen = set()

    # Read the file the caller asked for instead of a hard-coded
    # './project3.json'. Text mode with an explicit encoding behaves the same
    # on Python 2 and 3. Undecodable bytes are ignored because step 5 drops
    # every non-ASCII character anyway. newline='\n' splits lines on '\n'
    # only and disables newline translation, matching the previous
    # binary-mode behaviour.
    with io.open(path, 'r', encoding='utf-8', errors='ignore',
                 newline='\n') as reader:
        with io.open(filename, 'w', encoding='utf-8',
                     newline='\n') as writer:
            for line in reader:
                line = re.sub(r'[.,"!]+', '', line, flags=re.MULTILINE)
                line = re.sub(r'^RT[\s]+', '', line, flags=re.MULTILINE)
                line = re.sub(r'https?:\/\/.*[\r\n]*', '', line,
                              flags=re.MULTILINE)
                line = re.sub(r'[:]+', '', line, flags=re.MULTILINE)
                # ''.join(...) yields a string on both Python 2 and 3,
                # whereas filter() on a str returns an iterator on Python 3.
                line = ''.join(ch for ch in line if ch in printable)

                # Every kept word is followed by a single space, so the
                # cleaned line keeps its historical trailing space.
                new_line = ''.join(
                    word + ' ' for word in line.split()
                    if not word.startswith(('@', '#'))
                    and word not in string.punctuation)

                if new_line in seen:
                    continue
                seen.add(new_line)
                if new_line.strip():
                    # Separate records with a blank line; without a
                    # separator all tweets would run together on one line.
                    writer.write(new_line + '\n\n')
    return filename