Here is the code:
import json
import re
emoticons_str = r"""
(?:
[:=;] # Eyes
[oO\-]? # Nose (optional)
[D\)\]\(\]/\\OpP] # Mouth
)"""
regex_str = [
emoticons_str,
r'<[^>]+>', # HTML tags
r'(?:#[\w_]+)', # #-mentions
r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
r'http[s]?://(?:[a-z]|[0-9]|[$-_#.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
r'(?:[\w_]+)', # other words
r'(?:\S)' # anything else
]
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
def tokenize(s):
return tokens_re.findall(s)
def preprocess(s, lowercase=False):
tokens = tokenize(s)
if lowercase:
tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
return tokens
with open('mytweets.json', mode='r', encoding='utf-8') as f:
for line in f:
#line = f.readline()
tweet = json.loads(line)
print(preprocess(tweet['text']))
After running it showing the problems:
Getting the problem after running the codes
What is the solution of the problem?How i can successfully read data and tokenize the tweets from json format?
Here are some samples of mytweets.json
{"created_at":"Thu Jun 22 21:50:18 +0000 2017","id":878007261674602496,"id_str":"878007261674602496","text":"RT #wreckitroy: Well, I like dick, so I don't see this as a possibility, but thanks for trying to reach that far up my ass to try t\u2026 ","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":632645991,"id_str":"632645991","name":"meche","screen_name":"mercedessreyes","location":null,"url":null,"description":"I mean, really it's same me, it's old me \u2022 FSU '21 \u2022 https:\/\/vsco.co\/onlymeche","protected":false,"verified":false,"followers_count":1039,"friends_count":352,"listed_count":6,"favourites_count":21860,"statuses_count":21676,"created_at":"Wed Jul 11 04:06:28 +0000 2012","utc_offset":null,"time_zone":null,"geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FCEBB6","profile_background_image_url":"http:\/\/pbs.twimg.com\/profile_background_images\/762423763\/6c7d56ca20260816f75c10759208b283.png","profile_background_image_url_https":"https:\/\/pbs.twimg.com\/profile_background_images\/762423763\/6c7d56ca20260816f75c10759208b283.png","profile_background_tile":true,"profile_link_color":"CE7834","profile_sidebar_border_color":"F0A830","profile_sidebar_fill_color":"78C0A8","profile_text_color":"5E412F","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/876886584087502848\/9WSQDm8F_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/876886584087502848\/9WSQDm8F_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/632645991\/1497147929","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Wed Jun 21 02:57:42 +0000 2017","id":877359845074018304,"id_str":"877359845074018304","text":"Well, I like dick, so I don't see this as a possibility, but thanks for trying to reach that far up my ass to try t\u2026 https:\/\/t.co\/lUJzY60Sn8","display_text_range":[0,140],"source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":true,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":2341390003,"id_str":"2341390003","name":"roy","screen_name":"wreckitroy","location":"Fresno, CA","url":null,"description":"She said I'm looking like a bad man, smooth criminal. \ud83c\udf43 \/ snapchat\/instagram: thericharrow","protected":false,"verified":false,"followers_count":4831,"friends_count":1103,"listed_count":23,"favourites_count":79829,"statuses_count":1012,"created_at":"Thu Feb 13 04:30:59 +0000 2014","utc_offset":null,"time_zone":null,"geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"1DA1F2","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/876941549874978816\/eTGFmh8u_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/876941549874978816\/eTGFmh8u_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/2341390003\/1498157548","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"quoted_status_id":877359034621468672,"quoted_status_id_str":"877359034621468672","quoted_status":{"created_at":"Wed Jun 21 02:54:29 +0000 2017","id":877359034621468672,"id_str":"877359034621468672","text":"When you trying so hard to getvout the friend zone\ud83d\ude02\ud83d\ude02 https:\/\/t.co\/i8yFNbGDNn","display_text_range":[0,52],"source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":844510650,"id_str":"844510650","name":"\u3164","screen_name":"DaddyGunPlay","location":null,"url":null,"description":"One of the best Contoller players dont #. Bo2 is surperior #JellyFam\ud83c\udf47","protected":false,"verified":false,"followers_count":325,"friends_count":276,"listed_count":3,"favourites_count":1795,"statuses_count":5009,"created_at":"Mon Sep 24 23:51:03 +0000 2012","utc_offset":-25200,"time_zone":"Pacific Time (US & Canada)","geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"3B94D9","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"000000","profile_text_color":"000000","profile_use_background_image":false,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/874005327045414913\/NUPA2rvD_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/874005327045414913\/NUPA2rvD_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/844510650\/1496174936","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"quoted_status_id":877210813462740992,"quoted_status_id_str":"877210813462740992","is_quote_status":true,"retweet_count":45,"favorite_count":138,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/i8yFNbGDNn","expanded_url":"https:\/\/twitter.com\/wreckitroy\/status\/877210813462740992","display_url":"twitter.com\/wreckitroy\/sta\u2026","indices":[53,76]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":true,"extended_tweet":{"full_text":"Well, I like dick, so I don't see this as a possibility, but thanks for trying to reach that far up my ass to try to find the truth. \ud83d\ude09 https:\/\/t.co\/fv4Kqvv2sb","display_text_range":[0,134],"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/fv4Kqvv2sb","expanded_url":"https:\/\/twitter.com\/daddygunplay\/status\/877359034621468672","display_url":"twitter.com\/daddygunplay\/s\u2026","indices":[135,158]}],"user_mentions":[],"symbols":[]}},"retweet_count":2496,"favorite_count":12594,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/lUJzY60Sn8","expanded_url":"https:\/\/twitter.com\/i\/web\/status\/877359845074018304","display_url":"twitter.com\/i\/web\/status\/8\u2026","indices":[117,140]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"quoted_status_id":877359034621468672,"quoted_status_id_str":"877359034621468672","quoted_status":{"created_at":"Wed Jun 21 02:54:29 +0000 2017","id":877359034621468672,"id_str":"877359034621468672","text":"When you trying so hard to getvout the friend zone\ud83d\ude02\ud83d\ude02 https:\/\/t.co\/i8yFNbGDNn","display_text_range":[0,52],"source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":844510650,"id_str":"844510650","name":"\u3164","screen_name":"DaddyGunPlay","location":null,"url":null,"description":"One of the best Contoller players dont #. Bo2 is surperior #JellyFam\ud83c\udf47","protected":false,"verified":false,"followers_count":325,"friends_count":276,"listed_count":3,"favourites_count":1795,"statuses_count":5009,"created_at":"Mon Sep 24 23:51:03 +0000 2012","utc_offset":-25200,"time_zone":"Pacific Time (US & Canada)","geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"3B94D9","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"000000","profile_text_color":"000000","profile_use_background_image":false,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/874005327045414913\/NUPA2rvD_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/874005327045414913\/NUPA2rvD_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/844510650\/1496174936","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"quoted_status_id":877210813462740992,"quoted_status_id_str":"877210813462740992","is_quote_status":true,"retweet_count":45,"favorite_count":138,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/i8yFNbGDNn","expanded_url":"https:\/\/twitter.com\/wreckitroy\/status\/877210813462740992","display_url":"twitter.com\/wreckitroy\/sta\u2026","indices":[53,76]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":true,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[{"url":"","expanded_url":null,"indices":[133,133]}],"user_mentions":[{"screen_name":"wreckitroy","name":"roy","id":2341390003,"id_str":"2341390003","indices":[3,14]}],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"en","timestamp_ms":"1498168218426"}
{"created_at":"Thu Jun 22 21:50:18 +0000 2017","id":878007262320754692,"id_str":"878007262320754692","text":"It makes me feel some type of way now bree got another lil boy friend","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":47587983,"id_str":"47587983","name":"Kee Gotti","screen_name":"_BadGalKee","location":"Columbus, OH","url":null,"description":"\u2022 Instagram|_badgalkee \u2022 SnapChat| kbabiy","protected":false,"verified":false,"followers_count":1107,"friends_count":639,"listed_count":12,"favourites_count":1160,"statuses_count":28359,"created_at":"Tue Jun 16 09:46:12 +0000 2009","utc_offset":-18000,"time_zone":"Quito","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_tile":true,"profile_link_color":"009999","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/850590447261167616\/MuywFrn8_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/850590447261167616\/MuywFrn8_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/47587983\/1487216863","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"en","timestamp_ms":"1498168218580"}
{"created_at":"Thu Jun 22 21:50:18 +0000 2017","id":878007263310393344,"id_str":"878007263310393344","text":"I liked a #YouTube video https:\/\/t.co\/Znu4govqDi My Friend is in LOVE ...","source":"\u003ca href=\"http:\/\/www.google.com\/\" rel=\"nofollow\"\u003eGoogle\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":42287518,"id_str":"42287518","name":"David","screen_name":"iceman120","location":"FT LAUDERDALE, FL","url":"http:\/\/www.youtube.com\/iceman120dl","description":"\ue10e\ue10eOH YOU WANT SOME OF THIS\ue12f\ue12f\ue12f\ue12f\ue10e\ue10e","protected":false,"verified":false,"followers_count":4667,"friends_count":361,"listed_count":69,"favourites_count":134,"statuses_count":69716,"created_at":"Sun May 24 21:43:04 +0000 2009","utc_offset":-14400,"time_zone":"Eastern Time (US & Canada)","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/pbs.twimg.com\/profile_background_images\/53704022\/ahamericanflag72.br.jpg","profile_background_image_url_https":"https:\/\/pbs.twimg.com\/profile_background_images\/53704022\/ahamericanflag72.br.jpg","profile_background_tile":false,"profile_link_color":"D60000","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"1C1939","profile_text_color":"777777","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/511261204120363008\/DuNoXOXB_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/511261204120363008\/DuNoXOXB_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/42287518\/1375147278","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/Znu4govqDi","expanded_url":"http:\/\/youtu.be\/up6u1hzWHHc?a","display_url":"youtu.be\/up6u1hzWHHc?a","indices":[25,48]}],"user_mentions":[{"screen_name":"YouTube","name":"YouTube","id":10228272,"id_str":"10228272","indices":[10,18]}],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en","timestamp_ms":"1498168218816"}
You have posted samples, and as far as I see you just need to skip empty lines.
OLD ANSWER BELOW
You should parse json this way:
...
with open('mytweets.json', mode='r', encoding='utf-8') as f:
tweet = json.load(f)
...
json.load() accepts a file-like object as first argument.
What you're currently trying to do is reading file line by line and trying to parse each line as separate JSON string, and the file seems to be formatted, so you don't have complete json in any line.
You might want to iterate over tweets list in your file (if my guess is correct), not text lines and call print(preprocess()) in the loop.