I have a similar problem, I downloaded a large tweet file from the net saved it as data.txt and loaded into R using rstudio (import dataset). but had errors and cannot continue.
This is step by step on what i did and the errors i had.
# required packages
library(twitteR)
library(plyr)
library(ggplot2)
library(wordcloud)
library(RColorBrewer)
library(tm)
library(XML)
library(SnowballC)
data<- read.csv("~/data/datasStream.txt", header=FALSE , sep = "," )
I have 3425 observations and 97 variables
## i load it to corpus
corpus = Corpus(VectorSource(data)) ## 97 elements 17.4 MB
## i cleaned the data using
corpus = tm_map (corpus, tolower)
corpus = tm_map (corpus, stripWhitespace)
corpus = tm_map (corpus, stemDocument)
corpus = tm_map (corpus, PlainTextDocument)
# remove unnecessary spaces
corpus = gsub("[ \t]{2,}", "", corpus)
corpus = gsub("^\\s+|\\s+$", "", corpus)
# remove NAs in corpus
corpus = corpus[!is.na(corpus)]
dtm = DocumentTermMatrix(corpus)
dtm
<<DocumentTermMatrix (documents: 97, terms: 151132)>>
Non-/sparse entries: 201231/14458573
Sparsity : 99%
Maximal term length: 1775
Weighting : term frequency (tf)
adtm <- removeSparseTerms(dtm, 0.75)
adtm
<<DocumentTermMatrix (documents: 97, terms: 270)>>
Non-/sparse entries: 11962/14228
Sparsity : 54%
Maximal term length: 33
Weighting : term frequency (tf)
df1 = as.data.frame (m=as.matrix (adtm))
Error in as.data.frame.default(dtm) : cannot coerce class "c("DocumentTermMatrix", "simple_triplet_matrix")" to a data.frame
How can i resolve this problem? I want to perform a k-means clustering and word cloud with the data.
This is a sample data:
{"created_at":"Wed Feb 27 14:24:12 +0000 2013","id":306771719996186625,"id_str":"306771719996186625","text":"@Joeypearce we've got another bellend coming to see the car I'm having too help clean :-/ I'll see you when work ends ! X","source":"\u003ca href=\"http://twitter.com/download/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c/a\u003e","truncated":false,"in_reply_to_status_id":306763650054627328,"in_reply_to_status_id_str":"306763650054627328","in_reply_to_user_id":127665137,"in_reply_to_user_id_str":"127665137","in_reply_to_screen_name":"Joeypearce","user":{"id":274997668,"id_str":"274997668","name":"Ell Beaton \u00a9","screen_name":"Ell_Beaton","location":"","url":null,"description":"Go Glen, Or Go Home.","protected":false,"followers_count":147,"friends_count":85,"listed_count":0,"created_at":"Thu Mar 31 12:44:39 +0000 2011","favourites_count":132,"utc_offset":0,"time_zone":"London","geo_enabled":true,"verified":false,"statuses_count":1087,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"1A1B1F","profile_background_image_url":"http://a0.twimg.com/profile_background_images/768018009/7a0b3fe303f234e8d6a5429bb9ede9a9.jpeg","profile_background_image_url_https":"https://si0.twimg.com/profile_background_images/768018009/7a0b3fe303f234e8d6a5429bb9ede9a9.jpeg","profile_background_tile":true,"profile_image_url":"http://a0.twimg.com/profile_images/3304123896/606a7413bce208a1a38b1eb41fd017c9_normal.jpeg","profile_image_url_https":"https://si0.twimg.com/profile_images/3304123896/606a7413bce208a1a38b1eb41fd017c9_normal.jpeg","profile_banner_url":"https://si0.twimg.com/profile_banners/274997668/1361751912","profile_link_color":"F50E0E","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"252429","profile_text_color":"666666","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":{"type":"Point","coordinates":[52.43718380,-2.14324244]},"coordinates":{"type":"Point","coordinates":[-2.14324244,52.43718380]},"place":{"id":"ddeec3dc241e5b6a","url":"http://api.twitter.com/1/geo/id/ddeec3dc241e5b6a.json","place_type":"city","name":"Dudley","full_name":"Dudley, Dudley","country_code":"GB","country":"United Kingdom","bounding_box":{"type":"Polygon","coordinates":[[[-2.191947,52.426012],[-2.191947,52.558221],[-2.011849,52.558221],[-2.011849,52.426012]]]},"attributes":{}},"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Joeypearce","name":"Joey Pearce","id":127665137,"id_str":"127665137","indices":[0,11]}]},"favorited":false,"retweeted":false,"filter_level":"medium"}
dput(adtm)
instead? – Qaswed