Convert rows of CSV file to a list of tuples? - python

I have a .CSV file that has two columns one for Tweet and the other for sentiment value formatted like so (but for thousands of tweets):
I like stackoverflow,Positive
Thanks for your answers,Positive
I hate sugar,Negative
I do not like that movie,Negative
stackoverflow is a question and answer site,Neutral
Python is oop high-level programming language,Neutral
I would like to get the output like this:
negfeats = [('I do not like that movie','Negative'),('I hate sugar','Negative')]
posfeats = [('I like stackoverflow','Positive'),('Thanks for your answers','Positive')]
neufeats = [('stackoverflow is a question and answer site','Neutral'),('Python is oop high-level programming language','Neutral')]
I have tried the code below to do so, but some characters end up missing from the tuples. Also, how can I keep x, y, and z as integers rather than floats?
import csv

# Sentiment labels expected in column 2 of each CSV row.
neg = ['Negative']
pos = ['Positive']
neu = ['Neutral']
neg_counter = 0
pos_counter = 0
neu_counter = 0
negfeats = []
posfeats = []
neufeats = []
# NOTE(review): the original iterated the raw file handle AND re-ran
# csv.reader(f) inside the loop; that inner reader consumed the rest of the
# file and re-assigned the whole feature list each time, which is why rows
# and characters went missing. Parse each row exactly once instead.
with open('ff_tweets.csv', 'Ur') as f:  # 'U' = Py2 universal-newline mode
    for rec in csv.reader(f, delimiter=','):
        if any(word in rec[1] for word in neg):
            negfeats.append(tuple(rec))
            neg_counter += 1
        elif any(word in rec[1] for word in pos):
            posfeats.append(tuple(rec))
            pos_counter += 1
        else:
            neufeats.append(tuple(rec))
            neu_counter += 1
# Floor division (//) keeps x, y and z as integers, as the question asks.
x = neg_counter * 3 // 4
y = pos_counter * 3 // 4
z = neu_counter * 3 // 4  # fixes NameError: original read `neu_counte`
print(negfeats)
print(posfeats)
print(neufeats)
print(x)
print(y)
print(z)

This should work
import csv

# Sentiment labels expected in column 2 of each CSV row.
neg = 'Negative'
pos = 'Positive'
neu = 'Neutral'
negfeats = []
posfeats = []
neufeats = []
with open('ff_tweets.csv', 'Ur') as f:  # 'U' = Py2 universal-newline mode
    for r in csv.reader(f):
        # r[0] = tweet text, r[1] = sentiment label.
        # elif: the three labels are mutually exclusive, so there is no
        # point re-testing r[1] after a match (original used three ifs).
        if r[1] == neg:
            negfeats.append((r[0], r[1]))
        elif r[1] == pos:
            posfeats.append((r[0], r[1]))
        elif r[1] == neu:
            neufeats.append((r[0], r[1]))
# Floor division keeps x, y and z as integers — the question explicitly
# asked for ints; `float(3)/4` produced floats.
x = len(negfeats) * 3 // 4
y = len(posfeats) * 3 // 4
z = len(neufeats) * 3 // 4
print(negfeats)
print(posfeats)
print(neufeats)
print(x)
print(y)
print(z)

Try this, using Pandas. 'Sentiment' is a column in the csv file:
import pandas as pd

df = pd.read_csv('ff_tweets.csv')

def _as_tuples(label):
    """Rows whose Sentiment column equals `label`, each row as a tuple."""
    subset = df[df['Sentiment'] == label]
    return tuple(subset.apply(tuple, axis=1))

pos = _as_tuples('Positive')
neu = _as_tuples('Neutral')
neg = _as_tuples('Negative')
print(pos, neg, neu)
Output:
(('I like stackoverflow', 'Positive'), ('Thanks for your answers', 'Positive')) (('I hate sugar', 'Negative'), ('I do not like that movie', 'Negative')) (('stackoverflow is a question and answer site', 'Neutral'), ('Python is oop high-level programming language', 'Neutral'))

Related

I want to parallelize this code to execute faster for 800000 sentences

from app import getPhonemes  # project-local G2P model wrapper — TODO confirm signature
import pandas as pd
import sys

# Global pool of triphones still to be "covered"; filled by Process() and
# consumed (mutated) by scoreSentence().
triphones = []
def phonemize(sentence):
    """Return the phoneme string for `sentence`.

    The sentence is split on single spaces, run through the project's
    getPhonemes model, and the resulting phonemes are joined with '$'.
    """
    words = sentence.split(' ')
    return '$'.join(getPhonemes(words))
def generateTriphones(phonemes):
    """Return every ordered triple of `phonemes` (repetition allowed),
    each joined by single spaces, in nested-index order."""
    return [a + ' ' + b + ' ' + c
            for a in phonemes
            for b in phonemes
            for c in phonemes]
def scoreSentence(sentence, phonemes):
    """Score `sentence` by how many not-yet-seen triphones it covers.

    sentence: '$'-separated phoneme tokens (output of phonemize()).
    phonemes: unused; kept so existing callers keep working.
    Returns (score, flag): score = number of triphones from the global
    pool found as substrings of the sentence's triphonetic tokens (each
    found triphone is removed from the pool); flag = -1 once the pool is
    empty, else 0.
    """
    global triphones
    flag = 0
    score = 0
    tokens = sentence.split('$')
    uniqueTokens = set(tokens)
    # Tokens with more than one space, i.e. at least three phonemes.
    triphoneticTokens = [token for token in uniqueTokens if token.count(' ') > 1]
    for token in triphoneticTokens:
        # BUGFIX: iterate over a snapshot. The original removed items from
        # `triphones` while iterating it, which silently skips the element
        # that follows every removal.
        for triphone in list(triphones):
            if token.find(triphone) != -1:
                score += 1
                triphones.remove(triphone)
    # Checking once after the loops is equivalent to checking inside them:
    # the pool only ever shrinks, and flag is read only at return.
    if triphones == []:
        flag = -1
    return score, flag
def Process(fil):
    """Phonemize and score the sentences in <fil>.csv in chunks of 10000,
    writing each processed chunk to <fil>phonemized<i>.csv."""
    global triphones
    # vocab.phoneme: first 4 lines are skipped (presumably header/special
    # symbols — TODO confirm against the file), the rest are phonemes.
    file = open('itudict/vocab.phoneme', 'r',encoding='utf-8')
    data = []
    for line in file:
        data.append(line.strip())
    file.close()
    phonemes = data[4:]
    # Populate the global triphone pool consumed by scoreSentence().
    triphones = generateTriphones(phonemes)
    data = pd.read_csv(fil+'.csv')
    data = data.drop(['score','covered_vocab'],axis=1)
    i = 1
    while len(data) > 0:
        print('Processing File: '+str(i))
        # Take the next 10000-row chunk; `data` keeps the remainder.
        sentencee = data[:10000]
        data = data[10000:]
        sentences = sentencee['sentence'].tolist()
        phonemes = []  # per-chunk phoneme strings (shadows the vocab list above)
        scores = []
        for j in range(len(sentences)):
            if j%1000 == 0:
                # Progress: sentence index and triphones still uncovered.
                print('Processing Sentence: '+str(j))
                print(len(triphones))
            phones = phonemize(sentences[j])
            # NOTE(review): scoreSentence ignores its second argument and
            # works on the global `triphones`; passing `phonemes` has no effect.
            score, flag = scoreSentence(phones,phonemes)
            if flag == -1:
                # All triphones covered: emptying `data` ends the outer loop.
                data = []
            phonemes.append(phones)
            scores.append(score)
        # NOTE(review): these columns are attached to the REMAINING rows
        # (`data`), not the chunk just processed (`sentencee`), and lengths
        # will usually mismatch — looks like a bug; confirm intent.
        data['Phonemes'] = phonemes
        data['score'] = scores
        data.to_csv(fil+'phonemized'+str(i)+'.csv', index=False)
        i += 1

if __name__ == '__main__':
    # CLI entry point: first argument is the CSV basename (no extension).
    Process(sys.argv[1])
I am trying to generate the phonemes for 800000 sentences. The model which I am using is G2P, which phonemizes each sentence. After phonemization I am calculating the scores. The phoneme array which I am using for calculating the scores is of size 2620000.
The length of sentences are 800000 and the code is taking days, can somebody parallelize this code or suggest some solution
I want to parallelize this code to execute faster.

Python For Loop Count and Insert Numbers beside words

I have a problem regarding Python, I want to count the list item and then put a number beside the value of text.
This is the output I want:
test = 1
me = 2
texting = 3
This is the output I always get:
test = 3
me = 3
texting = 3
Here is my line of code:
text = request.form['title']
text2 = text.splitlines()
count = len(text2)
textarray = []
x = 0;
while(x <count):
for txt in text2:
textarray = [txt + " = " + str(x) for txt in text2]
x = x+1
string = '<br>'.join(textarray)
return render_template('index.html', text=string)
Fix
You don't need 2 loops: just iterate over text2 and increment your x, then append to the array instead of recreating it on every pass.
# Number each line of text2, counting from 1.
textarray = []
for x, txt in enumerate(text2, 1):
    textarray.append(txt + " = " + str(x))
Improve
Use enumerate to generate increasing value along with an iterable
# enumerate supplies the 1-based counter alongside each line.
textarray = [f"{txt} = {idx}" for idx, txt in enumerate(text2, 1)]
Best
Use generator construction and inline it
text = "test\nme\ntexting"
# Pair each line with its 1-based position, then join into one string.
numbered = (f"{word}={idx}" for idx, word in enumerate(text.splitlines(), 1))
result = '</br>'.join(numbered)
# return render_template('index.html', text=result)

Calculating Sentiment with Pandas - Looping Calculation is Slow

I have a pandas dataframe consisting of headlines. I am doing a simple calculation of the sentiment, by tokenizing and comparing the headlines with a list of positive and negative words. I am appending the over all sentiment for the headline into a column and then appending this to the original dataframe and saving as an Excel file.
The resulting and original files are about 12 mb. While the code below works, it is slow; and is taking me a couple of hours to fully read the file and assign the score. Is this normal? Is there anything I can do to speed up the process? I understand that loops within a pandas dataframe column may be slow - what are the alternatives?
# -*- coding: utf-8 -*-
from nltk.tokenize import word_tokenize
import pandas as pd
from violencevocabulary import new_words as extended_neg_list
import unicodedata
#function to calculate sentiment
def sentimentanalyzer(country_name, text_type):
    """Score the sentiment of every headline on sheet `country_name`,
    column `text_type`, of MasterData.xlsx and write the result next to
    the original data as an .xls file.

    Score = (# tokens in the positive list) - (# tokens in the negative list).
    """
    data = []
    xls_file = pd.ExcelFile('/UsersDesktop/MasterData.xlsx')
    df = xls_file.parse(country_name)
    text_body = pd.Series(df[text_type])
    headlines = text_body.tolist()
    for i in headlines:
        # Normalize unicode headlines to ascii (`unicode` is Python 2 only).
        if type(i) == unicode:
            i = unicodedata.normalize('NFKD', i).encode('ascii', 'ignore')
        data.append(i)

    # Sentiment word lists; the first 35 lines of each file are header text.
    with open('/Users/positive-words.txt', 'r') as f:
        positive_words = [line.rstrip('\n').lower() for line in f][35:]
    with open('/Users/Desktop/negative-words.txt', 'r') as g:
        negative_words = [line.strip('\n').lower() for line in g][35:]
    negative_words = negative_words + extended_neg_list

    # BUGFIX/PERF: the original wrapped the two membership scans in an extra
    # `for k in tokens` loop, recomputing both counts once per token — O(t^2*w)
    # per headline for identical results. Compute each count once, and use
    # sets for O(1) membership instead of scanning the word lists.
    positive_set = set(positive_words)
    negative_set = set(negative_words)

    senti_list = []
    for headline in data:
        tokens = word_tokenize(headline)
        pos_count = sum(1 for t in tokens if t in positive_set)
        neg_count = sum(1 for t in tokens if t in negative_set)
        senti_list.append(pos_count - neg_count)

    df2 = pd.Series(senti_list, name="Sentiment")
    new_data = pd.concat([df, df2], axis=1)
    new_data_name = '/Users/Desktop/Results/' + country_name + " " + text_type + ".xls"
    writer_new_data_name = pd.ExcelWriter(new_data_name, engine='xlsxwriter')
    new_data.to_excel(writer_new_data_name, sheet_name='Sheet1')
    return

Writing a list from a for loop into a csv

I wrote a for loop that iterates through a CSV to get a list like this:
[t1, s1]
[t2, s2]
[t3, s3]
and so on, about 4 thousand times.
Now I need to write these into a new CSV file, where they'd populate 2 fields and be separated by a comma.
When I enter this, I only get the last list from the last loop, and with one character in a cell.
def sentiment_analysis():
    """Label each BAC.csv row Bullish/Bearish/Neutral and (attempt to)
    write <time>,<sentiment> rows to BAC2_answer.csv.

    NOTE(review): the output stage at the bottom is broken — see comments.
    """
    # Word lists, one word per line.
    fo = open("positive_words.txt", "r")
    positive_words = fo.readlines()
    fo.close()
    positive_words = map(lambda positive_words: positive_words.strip(), positive_words)
    fo = open("negative_words.txt", "r")
    negative_words = fo.readlines()
    fo.close()
    negative_words = map(lambda negative_words: negative_words.strip(), negative_words)
    # Input rows: time,text,sentiment (comma-separated).
    fo = open("BAC.csv", "r")
    data = fo.readlines()
    fo.close()
    data = map(lambda data: data.strip(), data)
    x1 = 0 #number of bullish
    x2 = 0 #number of bearish
    x3 = 0 #number of unknown
    for info in data:
        data_specs = info.split(',')
        time_n_date = data_specs[0]
        sentiment = data_specs[2]
        '''Possibly precede with a nested for loop for data_specs???'''
        if sentiment == 'Bullish':
            '''fo.write(time + ',' + 'Bullish' + '\n')'''
        elif sentiment == 'Bearish':
            ''' fo.write(time + ',' + 'Bearish' + '\n')'''
        else:
            # Unlabeled row: score it by word overlap with the two lists.
            x3 += 1
            positive = 0
            negative = 0
            content_words = data_specs[1].split()
            for a in positive_words:
                for b in content_words:
                    if (a == b):
                        positive = positive + 1
            for c in negative_words:
                for d in content_words:
                    if (c == d):
                        negative = negative + 1
            if positive > negative:
                '''fo.write(time + ',' + 'Bullish' + '\n')'''
                sentiment = 'Bullish'
            elif positive < negative:
                sentiment = 'Bearish'
            else:
                sentiment = 'Neutral'
        bac2data = [time_n_date, sentiment]
        print bac2data
        # NOTE(review): re-opening in "w" on every iteration truncates the
        # file each time, and writerows() on a string emits one character
        # per row — this explains the reported one-character-per-cell output.
        fo = open("C:\Users\Siddhartha\Documents\INFS 772\Project\Answer\BAC2_answer.csv", "w")
        for x in bac2data:
            w = csv.writer(fo, delimiter = ',')
            w.writerows(x)
        fo.close()
My for loop isn't going through it all.
In your code bac2data = [time_n_date, sentiment] creates a list containing 2 string items. The proper way to write that to a CSV file with csv.writer() is with writerow(bac2data).
The last part of your code contains a number of errors. Firstly you are opening the CSV file in write mode ('w') for every line of the incoming data. This will overwrite the file each time, losing all data except the last line. Then you are iterating over the bac2data list and calling writerows() on each item. That's going to write each character from the string on it's own line (which matches your reported output).
Instead, open the output file and create a csv.writer outside of the main for info in data: loop:
fo = open("C:\Users\Siddhartha\Documents\INFS 772\Project\Answer\BAC2_answer.csv", "w")
writer = csv.writer(fo)
for info in data:
....
Then replace these lines at the bottom of the main loop:
bac2data = [time_n_date, sentiment]
print bac2data
fo = open("C:\Users\Siddhartha\Documents\INFS 772\Project\Answer\BAC2_answer.csv", "w")
for x in bac2data:
w = csv.writer(fo, delimiter = ',')
w.writerows(x)
fo.close()
with this:
bac2data = [time_n_date, sentiment]
print bac2data
writer.writerow(bac2data)
Once you have that working, and no longer need to print bac2data for debugging, you can just use 1 line:
writer.writerow((time_n_date, sentiment))
Update
Complete code for function:
def sentiment_analysis():
fo = open("positive_words.txt", "r")
positive_words = fo.readlines()
fo.close()
positive_words = map(lambda positive_words: positive_words.strip(), positive_words)
fo = open("negative_words.txt", "r")
negative_words = fo.readlines()
fo.close()
negative_words = map(lambda negative_words: negative_words.strip(), negative_words)
fo = open("BAC.csv", "r")
data = fo.readlines()
fo.close()
data = map(lambda data: data.strip(), data)
x1 = 0 #number of bullish
x2 = 0 #number of bearish
x3 = 0 #number of unknown
fo = open("C:\Users\Siddhartha\Documents\INFS 772\Project\Answer\BAC2_answer.csv", "w")
writer = csv.writer(fo)
for info in data:
data_specs = info.split(',')
time_n_date = data_specs[0]
sentiment = data_specs[2]
'''Possibly precede with a nested for loop for data_specs???'''
if sentiment == 'Bullish':
'''fo.write(time + ',' + 'Bullish' + '\n')'''
elif sentiment == 'Bearish':
''' fo.write(time + ',' + 'Bearish' + '\n')'''
else:
x3 += 1
positive = 0
negative = 0
content_words = data_specs[1].split()
for a in positive_words:
for b in content_words:
if (a == b):
positive = positive + 1
for c in negative_words:
for d in content_words:
if (c == d):
negative = negative + 1
if positive > negative:
'''fo.write(time + ',' + 'Bullish' + '\n')'''
sentiment = 'Bullish'
elif positive < negative:
sentiment = 'Bearish'
else:
sentiment = 'Neutral'
bac2data = [time_n_date, sentiment]
print bac2data
writer.writerow(bac2data)
fo.close()

how to align strings in python without creating a table

I have a problem. I'm trying to print a serie of lists in python to have it with a vertical align. My code is:
def show():
    """Print the records in data.txt as three right-aligned columns
    (Name, Gender, Year)."""
    book = "data.txt"
    f = open(book,'r')
    line = f.readlines()
    f.close()
    x=0
    z = ''  # unused
    l = []  # unused
    x = []  # overwritten in the loop below
    i = 0   # unused
    # Header row.
    starting = '{:>4} {:>15} {:>15}'.format('Name', "Gender", "Year")
    print(starting)
    for p in line:
        p = p.replace(',',' ')
        # First space ends the name field.
        x = p.index(' ')
        name = p[0:x]
        # NOTE(review): 'e 1' appears to locate the end of a gender word
        # before a year starting with '1' (e.g. "male 1990") — fragile;
        # confirm against the actual data.txt format.
        a = p.index('e 1')
        gender = p[x:a+1]
        year = p[(a+2):]
        # Narrower name column for 3-letter names — this uneven width is
        # why the columns drift out of alignment.
        if len(name) == 3:
            line_new = '{:>2} {:>15} {:>15}'.format(name, gender, year)
        else:
            line_new = '{:>5} {:>15} {:>15}'.format(name, gender, year)
        print(line_new)
The problem is that I'm trying to have something like:
I want to put all the names on the left (and I have no problems with that); then, under Gender, I want all the genders aligned in the same vertical column, and the same for Year.
Untested, but try this:
import itertools
with open("data.txt") as data:
pep = [line.strip().split(',') for line in data]
widths = [len(max(r, key=len)) for r in itertools.izip_longest(*pep, fillvalue="")]
print "%-{0}%s%-{1}%s%-{2}%s".format(widths[0], widths[1], widths[2])\
%("Name", "Gender", "Year")
print "\n".join(["%-{0}%s%-{1}%s%-{2}%s".format(widths[0], widths[1], widths[2])\
%(attr[0], attr[1], attr[2]) for attr in pep])

Categories