python tweet parsing

I'm trying to parse tweet data.
My data shape is as follows:
59593936 3061025991 null null <d>2009-08-01 00:00:37</d> <s><a href="http://help.twitter.com/index.php?pg=kb.page&id=75" rel="nofollow">txt</a></s> <t>honda just recalled 440k accords...traffic around here is gonna be light...win!!</t> ajc8587 15 24 158 -18000 0 0 <n>adrienne conner</n> <ud>2009-07-23 21:27:10</ud> <t>eastern time (us & canada)</t> <l>ga</l>
22020233 3061032620 null null <d>2009-08-01 00:01:03</d> <s><a href="http://alexking.org/projects/wordpress" rel="nofollow">twitter tools</a></s> <t>new blog post: honda recalls 440k cars over airbag risk http://bit.ly/2wsma</t> madcitywi 294 290 9098 -21600 0 0 <n>madcity</n> <ud>2009-02-26 15:25:04</ud> <t>central time (us & canada)</t> <l>madison, wi</l>
I want to get the total number of tweets and the number of keyword-related tweets. I prepared the keywords in a text file. In addition, I want to get the tweet text contents and the total number of tweets which contain a mention (#), a retweet (RT), and a URL (I want to save every URL in another file).
So I coded like this:
import time
import os

total_tweet_count = 0
related_tweet_count = 0
rt_count = 0
mention_count = 0
URLs = {}

def get_keywords(filepath):
    with open(filepath) as f:
        for line in f:
            yield line.split()

for line in open('/nas/minsu/2009_06.txt'):
    tweet = line.strip()
    total_tweet_count += 1
    with open('./related_tweets.txt', 'a') as save_file_1:
        keywords = get_keywords('./related_keywords.txt', 'r')
        if keywords in line:
            text = line.split('<t>')[1].split('</t>')[0]
            if 'http://' in text:
                try:
                    url = text.split('http://')[1].split()[0]
                    url = 'http://' + url
                    if url not in URLs:
                        URLs[url] = []
                    URLs[url].append('\t' + text)
                    save_file_3 = open('./URLs_in_related_tweets.txt', 'a')
                    print >> save_file_3, URLs
                except:
                    pass
            if '#' in text:
                mention_count += 1
            if 'RT' in text:
                rt_count += 1
            related_tweet_count += 1
            print >> save_file_1, text

save_file_2 = open('./info_related_tweets.txt', 'w')
print >> save_file_2, str(total_tweet_count) + '\t' + srt(related_tweet_count) + '\t' + str(mention_count) + '\t' + str(rt_count)
save_file_1.close()
save_file_2.close()
save_file_3.close()
The keyword set looks like this:
Happy
Hello
Together
I think my code has many problems, but the first error is as follows:
Traceback (most recent call last):
  File "health_related_tweets.py", line 21, in <module>
    keywords = get_keywords('./public_health_related_words.txt', 'r')
TypeError: get_keywords() takes exactly 1 argument (2 given)
Please help me out!

The issue is self-explanatory from the error: you have passed two arguments in your call to get_keywords(), but your implementation only takes one parameter. You should change your get_keywords implementation to something like:
def get_keywords(filepath, mode):
    with open(filepath, mode) as f:
        for line in f:
            yield line.split()
Then you can use the following line without that specific error:
keywords = get_keywords('./related_keywords.txt', 'r')

Now you are getting this error:
Traceback (most recent call last):
  File "health_related_tweets.py", line 23, in <module>
    if keywords in line:
TypeError: 'in <string>' requires string as left operand, not generator
The reason is that keywords = get_keywords(...) returns a generator. Thinking about it logically, keywords should be a list of all the keywords, and for each keyword in this list you want to check whether it's in the tweet/line or not.
Sample code:
keywords = get_keywords('./related_keywords.txt', 'r')

has_keyword = False
for keyword in keywords:
    if keyword in line:
        has_keyword = True
        break

if has_keyword:
    # Your code here (for the case when the line has at least one keyword)
(The above code would be replacing if keywords in line:)
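One further subtlety, noted here as an aside: the modified get_keywords still yields line.split(), i.e. one list per line of the keyword file, so keyword in the loop above would itself be a list and keyword in line would raise yet another TypeError. A minimal sketch of a variant that yields plain strings, assuming the keyword file holds whitespace-separated keywords:

def get_keywords(filepath, mode):
    # yield each whitespace-separated keyword as its own string
    with open(filepath, mode) as f:
        for line in f:
            for word in line.split():
                yield word

keywords = list(get_keywords('./related_keywords.txt', 'r'))
# True as soon as any keyword occurs in the current line
has_keyword = any(keyword in line for keyword in keywords)

Materializing the generator into a list once, outside the tweet loop, also avoids re-reading the keyword file for every tweet.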

List index out of range with stanford-nlp

I'm trying to remove all blank lines from a large .txt file, but whatever method I use, it always returns this traceback:
Traceback (most recent call last):
  File "C:\Users\svp12\PycharmProjects\practiques\main.py", line 53, in <module>
    doc = nlp(texts[line])
IndexError: list index out of range
If I don't remove these spaces then I get IndexErrors in the subsequent two for loops (or at least I think that's the reason), which is why I'm using the try/except like this:
try:
    for word in doc.sentences[0].words:
        noun.append(word.text)
        lemma.append(word.lemma)
        pos.append(word.pos)
        xpos.append(word.xpos)
        deprel.append(word.deprel)
except IndexError:
    errors += 1
    pass
I'd like to be able to remove all blank lines and not have to dodge IndexErrors like this. Any idea how to fix this?
Here's the whole code:
import io
import stanza
import os

def linecount(filename):
    ffile = open(filename, 'rb')
    lines = 0
    buf_size = 1024 * 1024
    read_f = ffile.read
    buf = read_f(buf_size)
    while buf:
        lines += buf.count(b'\n')
        buf = read_f(buf_size)
    return lines

errors = 0

with io.open('#_Calvia_2018-01-01_2022-04-01.txt', 'r+', encoding='utf-8') as f:
    text = f.read()

# replacing eos with \n, numbers and symbols
texts = text.replace('eos', '.\n')
texts = texts.replace('0', ' ').replace('1', ' ').replace('2', ' ').replace('3', ' ').replace('4', ' ')\
    .replace('5', ' ').replace('6', ' ').replace('7', ' ').replace('8', ' ').replace('9', ' ').replace(',', ' ')\
    .replace('"', ' ').replace('·', ' ').replace('?', ' ').replace('¿', ' ').replace(':', ' ').replace(';', ' ')\
    .replace('-', ' ').replace('!', ' ').replace('¡', ' ').replace('.', ' ').splitlines()

os.system("sed -i \'/^$/d\' #_Calvia_2018-01-01_2022-04-01.txt")  # removing empty lines to avoid IndexError

nlp = stanza.Pipeline(lang='ca')
nouns = []
lemmas = []
poses = []
xposes = []
heads = []
deprels = []
total_lines = linecount('#_Calvia_2018-01-01_2022-04-01.txt') - 1

for line in range(50):  # range should be total_lines, which is 6682
    noun = []
    lemma = []
    pos = []
    xpos = []
    head = []
    deprel = []
    # print('analyzing: '+str(line+1)+' / '+str(len(texts)), end='\r')
    doc = nlp(texts[line])
    try:
        for word in doc.sentences[0].words:
            noun.append(word.text)
            lemma.append(word.lemma)
            pos.append(word.pos)
            xpos.append(word.xpos)
            deprel.append(word.deprel)
    except IndexError:
        errors += 1
        pass
    try:
        for word in doc.sentences[0].words:
            head.extend([lemma[word.head-1] if word.head > 0 else "root"])
    except IndexError:
        errors += 1
        pass
    nouns.append(noun)
    lemmas.append(lemma)
    poses.append(pos)
    xposes.append(xpos)
    heads.append(head)
    deprels.append(deprel)

print(nouns)
print(lemmas)
print(poses)
print(xposes)
print(heads)
print(deprels)
print("errors: " + str(errors))  # weird, seems to be range/2-1
And as a side question, is it worth importing os just for this line (the one removing the blank lines)?
os.system("sed -i \'/^$/d\' #_Calvia_2018-01-01_2022-04-01.txt")
I can't guarantee that this works because I couldn't test it, but it should give you an idea of how you'd approach this task in Python.
I'm omitting the head processing (the second loop) here; that's for you to figure out.
I'd recommend throwing some prints in there and looking at the output, making sure you understand what's going on (especially with the different data types). Also look at examples of applications using Stanford NLP and watch some tutorials online (from start to finish, no skipping), etc.
import stanza
import re

def clean(line):
    # function that does the text cleaning
    line = line.replace('eos', '.\n')
    line = re.sub(r'[\d,"·?¿:;!¡.-]', ' ', line)
    return line.strip()

nlp = stanza.Pipeline(lang='ca')

# instead of individual variables, you could keep the values in a dictionary
# (or just leave them as they are - your call)
values_to_extract = ['text', 'lemma', 'pos', 'xpos', 'deprel']
data = {v: [] for v in values_to_extract}

with open('#_Calvia_2018-01-01_2022-04-01.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # clean the text
        line = clean(line)
        # skip empty lines
        if not line:
            continue
        doc = nlp(line)
        # loop over sentences – this will work even if it's an empty list
        for sentence in doc.sentences:
            # append a new list to the dictionary entries
            for v in values_to_extract:
                data[v].append([])
            for word in sentence.words:
                for v in values_to_extract:
                    # extract the attribute (e.g.,
                    # a surface form, a lemma, a pos tag, etc.)
                    attribute = getattr(word, v)
                    # and add it to its slot
                    data[v][-1].append(attribute)

for v in values_to_extract:
    print('Value:', v)
    print(data[v])
    print()
Since texts doesn't have 50 lines, why do you hardcode 50?
If you just need to remove blank lines, you only have to do text = text.replace("\n\n", "\n")
If you need to remove lines that are just whitespace, you can do:
text = '\n'.join(line.rstrip() for line in text.split('\n') if line.strip())
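To make the behaviour of that one-liner concrete, a small usage example (input string invented for illustration):

text = "line one\n\n   \nline two\n"
cleaned = '\n'.join(line.rstrip() for line in text.split('\n') if line.strip())
print(repr(cleaned))  # prints 'line one\nline two'

This drops empty and whitespace-only lines in memory, without touching the file on disk, so the sed call (and the os import) becomes unnecessary.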

How to fix TypeError: 'int' object is not callable from a divided number

I'm trying to create a program to generate text with usernames from a txt file, but I keep getting a TypeError: 'int' object is not iterable. I know what this means, but I have no idea how to fix my issue. I tried just doing y = 12 / 2 and the same error came up when I passed y to the for loop. I am really confused, so if someone could help me that would be great.
This is my code:
def generateNum():
    #imports random
    from random import randint
    for _ in range(10):
        value = randint(0, 900000)
    return(str(value))

def getNumOfLines(file):
    #opens txt file
    with open(file) as f:
        Lines = f.readlines()
        count = 0
        # Strips the newline character
        for line in Lines:
            count += 1
        return(count)

class debug:
    def __init__(self, credsTxt, tagsTxt):
        self.credsTxt = credsTxt
        self.tagsTxt = tagsTxt
        self.numOfCreds = getNumOfLines(credsTxt)
        self.numOfTags = getNumOfLines(tagsTxt)
        self.ammountPerAccount = round(self.numOfTags / self.numOfCreds)

    def getComments(self):
        #initializes comment
        comment = ""
        #opens txt file
        file1 = open(self.tagsTxt, 'r')
        count = 0
        while True:
            count += 1
            # Get next line from file
            line = file1.readline()
            for i in self.ammountPerAccount:
                # if line is empty
                # end of file is reached
                if not line:
                    break
                comment += ' ' + line.strip() + ' ' + generateNum() + '.'
            return(comment)

print(debug('D:/FiverrWork/user/instagram-bot/textGen/assets/login_Info.txt', 'D:/FiverrWork/user/instagram-bot/textGen/assets/tags.txt').getComments())
This is my stack trace:
Traceback (most recent call last):
  File "d:\FiverrWork\user\textgenerator\textgenerator\txt.py", line 57, in <module>
    print(debug('D:/FiverrWork/user/textgenerator/textgenerator/assets/login_Info.txt', 'D:/FiverrWork/user/textgenerator/textgenerator/assets/tags.txt').getComments())
  File "d:\FiverrWork\user\textgenerator\textgenerator\txt.py", line 47, in getComments
    for i in self.ammountPerAccount():
TypeError: 'int' object is not callable
Your for loop as posted cannot iterate over an int. You meant to iterate over a range():
for _ in range(self.ammountPerAccount):
    # if line is empty
    # end of file is reached
    if not line:
        break
    comment += ' ' + line.strip() + ' ' + generateNum() + '.'
I used _ as a placeholder variable since the actual value of i each time was not used.
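For context, here is one way the surrounding method could be restructured around that fix. This is only a sketch, keeping the question's names and assuming the intent is to repeat each tag line ammountPerAccount times:

def getComments(self):
    comment = ""
    with open(self.tagsTxt, 'r') as file1:
        while True:
            # get the next line from the file
            line = file1.readline()
            # an empty string means end of file is reached
            if not line:
                break
            for _ in range(self.ammountPerAccount):
                comment += ' ' + line.strip() + ' ' + generateNum() + '.'
    return comment

Checking for end of file once per line, before the inner loop, keeps the break out of the repetition logic, and the with block closes the file that the original code left open.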

split() issues with pdf extractText()

I'm working on a minor content analysis program that I hope to run over several PDF files, returning the total frequencies of some specific words mentioned in the text. The words searched for are specified in a separate text file (list.txt) and can be altered. The program runs just fine on files in .txt format, but the result is completely different when running it on a .pdf file. To illustrate, the test text that I have the program running through is the following:
"Hello
This is a product development notice
We’re working with innovative measures
A nice Innovation
The world that we live in is innovative
We are currently working on a new process
And in the fall, you will experience our new product development introduction"
The list of words, grouped in categories, is the following (categories are marked in the .txt file with ">>"):
innovation: innovat
product: Product, development, introduction
organization: Process
The output from running the code on the .txt file is as expected, whereas the output from running it on the .pdf is completely different: the splitting of the words goes wrong, so in the .pdf output a string like "world" can be split into 'w', 'o', 'rld'. I have tried tirelessly to find out why this happens, without success. As I am rather new to Python programming, I would appreciate any answer, or a pointer to a source where I can find one, should you know any.
Thanks
The code for the .txt is as follows:
import string, re, os
import PyPDF2

dictfile = open('list.txt')
lines = dictfile.readlines()
dictfile.close()
dic = {}
scores = {}

i = 2011
while i < 2012:
    f = 'annual_report_' + str(i) + '.txt'
    textfile = open(f)
    text = textfile.read().split()  # lowercase the text
    print(text)
    textfile.close()
    i = i + 1

# a default category for simple word lists
current_category = "Default"
scores[current_category] = 0

# import the dictionary
for line in lines:
    if line[0:2] == '>>':
        current_category = line[2:].strip()
        scores[current_category] = 0
    else:
        line = line.strip()
        if len(line) > 0:
            pattern = re.compile(line, re.IGNORECASE)
            dic[pattern] = current_category

# examine the text
for token in text:
    for pattern in dic.keys():
        if pattern.match(token):
            categ = dic[pattern]
            scores[categ] = scores[categ] + 1

print(os.path.basename(f))
for key in scores.keys():
    print(key, ":", scores[key])
While the code for the .pdf is as follows:
import string, re, os
import PyPDF2

dictfile = open('list.txt')
lines = dictfile.readlines()
dictfile.close()
dic = {}
scores = {}

i = 2011
while i < 2012:
    f = 'annual_report_' + str(i) + '.pdf'
    textfile = open(f, 'rb')
    text = PyPDF2.PdfFileReader(textfile)  # lowercase the text
    for pageNum in range(0, text.numPages):
        texts = text.getPage(pageNum)
        textfile = texts.extractText().split()
        print(textfile)
    i = i + 1

# a default category for simple word lists
current_category = "Default"
scores[current_category] = 0

# import the dictionary
for line in lines:
    if line[0:2] == '>>':
        current_category = line[2:].strip()
        scores[current_category] = 0
    else:
        line = line.strip()
        if len(line) > 0:
            pattern = re.compile(line, re.IGNORECASE)
            dic[pattern] = current_category

# examine the text
for token in textfile:
    for pattern in dic.keys():
        if pattern.match(token):
            categ = dic[pattern]
            scores[categ] = scores[categ] + 1

print(os.path.basename(f))
for key in scores.keys():
    print(key, ":", scores[key])
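One detail in the .pdf version stands out: textfile is overwritten on every page, so only the last page's tokens reach the scoring loop. Below is a sketch that accumulates the tokens of all pages first, using the same PyPDF2 1.x calls as above (filename assumed for illustration). Note that extractText() rebuilds text from the PDF's internal drawing operators, and its imperfect spacing reconstruction would be consistent with splits like 'w', 'o', 'rld':

import PyPDF2

tokens = []
with open('annual_report_2011.pdf', 'rb') as fh:  # assumed filename
    reader = PyPDF2.PdfFileReader(fh)
    for pageNum in range(reader.numPages):
        # collect every page's whitespace-separated tokens
        tokens.extend(reader.getPage(pageNum).extractText().split())
print(tokens[:20])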

Python JSON KeyError for non missing key

For some unknown reason, when I run the script below, the following error is returned along with the desired output. This was working without any errors last night, and while the API output does change every minute, I wouldn't expect a KeyError to be returned. I simply can't pinpoint where this error is coming from:
[u'#AAPL 151204C00128000'] <----- What I want to see printed
Traceback (most recent call last):
  File "Options_testing.py", line 60, in <module>
    main()
  File "Options_testing.py", line 56, in main
    if quotes[x]['greeks']['impvol'] > 0: #change this for different greek vals
KeyError: 'impvol'
Here is a little snippet of data:
{"results":{"optionchain":{"expire":"all","excode":"oprac","equityinfo":{"longname":"Apple Inc","shortname":"AAPL"},"money":"at","callput":"all","key":{"symbol":["AAPL"],"exLgName":"Nasdaq Global Select","exShName":"NGS","exchange":"NGS"},"symbolstring":"AAPL"},"quote":[{"delaymin":15,"contract":{"strike":108,"openinterest":3516,"contracthigh":6.16,"contractlow":0.02,"callput":"Put","type":"WEEK","expirydate":"2015-11-13"},"root":{"equityinfo":{"longname":"Apple Inc","shortname":"AAPL"},"key":{"symbol":["AAPL"],"exLgName":"Nasdaq Global Select","exShName":"NGS","exchange":"NGS"}},"greeks":{"vega":0,"theta":0,"gamma":0,"delta":0,"impvol":0,"rho":0}
Code:
#Options screener using Quotemedia's API
import json
import requests
#import csv

def main():
    url_auth = "https://app.quotemedia.com/user/g/authenticate/v0/102368/XXXXX/XXXXX"
    decode_auth = requests.get(url_auth)
    #print decode_auth.json()
    #print(type(decode_auth))
    auth_data = json.dumps(decode_auth.json())
    #Parse decode_auth, grab 'sid'
    sid_parsed = json.loads(auth_data)["sid"]
    #print sid_parsed

    #Pass sid into qm_options
    #Construct URL
    symbol = 'AAPL'
    SID = sid_parsed
    url_raw = 'http://app.quotemedia.com/data/getOptionQuotes.json?webmasterId=102368'
    url_data = url_raw + '&symbol=' + symbol + '&greeks=true' + '&SID=' + SID
    #print url_data
    response = requests.get(url_data)
    #print response
    data = json.dumps(response.json())
    #print data

    #save data to a file
    with open('AAPL_20151118.json', 'w') as outfile:
        json.dumps(data, outfile)

    #Turn into json object
    obj = json.loads(data)
    #slim the object
    quotes = obj['results']['quote']
    #find the number of options contracts
    range_count = obj['results']['symbolcount']
    #print all contracts with an implied vol > 0
    for x in range(0, range_count):
        if quotes[x]['greeks']['impvol'] > 0: #change this for different greek vals
            print quotes[x]['key']['symbol']

if __name__ == '__main__':
    main()
I can provide sample data if necessary.
for x in range(0, range_count):
    if quotes[x]['greeks']['impvol'] > 0: #change this for different greek vals
        print quotes[x]['key']['symbol']
This loops through multiple quotes, so maybe there is just one of them that does not have an impvol property.
You should add some error handling, so you find out when that happens. Something like this:
# no need to iterate over indexes, just iterate over the items
for quote in quotes:
    if 'greeks' not in quote:
        print('Quote does not contain `greeks`:', quote)
    elif 'impvol' not in quote['greeks']:
        print('Quote does not contain `impvol`:', quote)
    elif quote['greeks']['impvol'] > 0:
        print quote['key']['symbol']
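An equivalent, more compact guard uses dict.get with a default, silently skipping quotes that lack the keys (kept in the question's Python 2 print style):

for quote in quotes:
    # None when 'greeks' or 'impvol' is missing
    impvol = quote.get('greeks', {}).get('impvol')
    if impvol is not None and impvol > 0:
        print quote['key']['symbol']

Whether to skip malformed quotes silently, or report them as above, depends on how much you trust the API output.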

list / string in python

I'm trying to parse tweet data.
My data shape is as follows:
59593936 3061025991 null null <d>2009-08-01 00:00:37</d> <s><a href="http://help.twitter.com/index.php?pg=kb.page&id=75" rel="nofollow">txt</a></s> <t>honda just recalled 440k accords...traffic around here is gonna be light...win!!</t> ajc8587 15 24 158 -18000 0 0 <n>adrienne conner</n> <ud>2009-07-23 21:27:10</ud> <t>eastern time (us & canada)</t> <l>ga</l>
22020233 3061032620 null null <d>2009-08-01 00:01:03</d> <s><a href="http://alexking.org/projects/wordpress" rel="nofollow">twitter tools</a></s> <t>new blog post: honda recalls 440k cars over airbag risk http://bit.ly/2wsma</t> madcitywi 294 290 9098 -21600 0 0 <n>madcity</n> <ud>2009-02-26 15:25:04</ud> <t>central time (us & canada)</t> <l>madison, wi</l>
I want to get the total number of tweets and the number of keyword-related tweets. I prepared the keywords in a text file. In addition, I want to get the tweet text contents and the total number of tweets which contain a mention (#), a retweet (RT), and a URL (I want to save every URL in another file).
So I coded like this:
import time
import os

total_tweet_count = 0
related_tweet_count = 0
rt_count = 0
mention_count = 0
URLs = {}

def get_keywords(filepath, mode):
    with open(filepath, mode) as f:
        for line in f:
            yield line.split().lower()

for line in open('/nas/minsu/2009_06.txt'):
    tweet = line.strip().lower()
    total_tweet_count += 1
    with open('./related_tweets.txt', 'a') as save_file_1:
        keywords = get_keywords('./related_keywords.txt', 'r')
        if keywords in line:
            text = line.split('<t>')[1].split('</t>')[0]
            if 'http://' in text:
                try:
                    url = text.split('http://')[1].split()[0]
                    url = 'http://' + url
                    if url not in URLs:
                        URLs[url] = []
                    URLs[url].append('\t' + text)
                    save_file_3 = open('./URLs_in_related_tweets.txt', 'a')
                    print >> save_file_3, URLs
                except:
                    pass
            if '#' in text:
                mention_count += 1
            if 'RT' in text:
                rt_count += 1
            related_tweet_count += 1
            print >> save_file_1, text

save_file_2 = open('./info_related_tweets.txt', 'w')
print >> save_file_2, str(total_tweet_count) + '\t' + srt(related_tweet_count) + '\t' + str(mention_count) + '\t' + str(rt_count)
save_file_1.close()
save_file_2.close()
save_file_3.close()
The following is a sample of the keywords:
Depression
Placebo
X-rays
X-ray
HIV
Blood preasure
Flu
Fever
Oral Health
Antibiotics
Diabetes
Mellitus
Genetic disorders
I think my code has many problems, but the first error is as follows:
line 23, in <module>
    if keywords in line:
TypeError: 'in <string>' requires string as left operand, not generator
I coded "def ..." part. I think it has a problem. When I try "print keywords" under line (keywords = get_keywords('./related_keywords.txt', 'r')), it gives something strange numbers not words.... . Please help me out!
Maybe change if keywords in line: to use a regular expression match instead. For example, something like:
import re
...
keywords = "|".join(get_keywords('./related_keywords.txt', 'r'))
matcher = re.compile(keywords)
if matcher.match(line):
    text = ...
And change get_keywords to something like this instead:
def get_keywords(filepath, mode):
    keywords = []
    with open(filepath, mode) as f:
        for line in f:
            sp = line.split()
            for w in sp:
                keywords.append(w.lower())
    return keywords
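Two caveats on this regex approach are worth keeping in mind: keywords read from a file may contain regex metacharacters (a '.' or '+' in a keyword would change its meaning), so escaping each one is safer, and matcher.match() only matches at the start of the line, whereas re.search() scans the whole line, which is what a keyword check needs. A sketch combining both, using the same file and names as above:

import re

keywords = get_keywords('./related_keywords.txt', 'r')
# escape each keyword so it is matched literally, then OR them together
pattern = re.compile('|'.join(re.escape(k) for k in keywords), re.IGNORECASE)
if pattern.search(line):
    text = line.split('<t>')[1].split('</t>')[0]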
