Python List Wrangling - python

I take a list that looks like this
[[x, a, d], [y], [z,f]]
The list is wrangled by this loop:
# tagToPrint is the raw JSON
# Keep every element of tagToPrint[x] that is not in allConstrainedTagList.
# v counts the elements kept so far (its initialisation is not shown here --
# presumably 0; TODO confirm).
for tagElement in tagToPrint[x]:
    if tagElement not in allConstrainedTagList:
        if v == 0:
            # First kept element: topicTagToPrint[x] receives the bare
            # element, while topicTagConcat starts as a one-element list.
            topicTagConcat = [tagElement]
            topicTagToPrint[x] = tagElement
            v = v + 1
        elif v >= 1:
            # NOTE(review): this wraps the previous list in a new two-element
            # list ([[a], b], then [[[a], b], c], ...) rather than appending;
            # confirm the nesting is intended.
            topicTagConcat = [topicTagConcat, tagElement]
            topicTagToPrint[x] = topicTagConcat
            v = v + 1
which then uses this
# NOTE(review): binary mode ('wb') is the Python 2 convention for csv files;
# Python 3's csv module expects a text-mode file opened with newline=''.
with open("output.csv", 'wb') as outcsv:
#configure writer to write standard csv file
writer = csv.writer(outcsv, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
# header row (remaining column names elided in the original post)
writer.writerow(['Topic Tag', ..... ])
# write rowPrinter[x] for every x below y, one csv row each
while x < y:
writer.writerow(rowPrinter[x])
x = x + 1
to print this out
ID ColA ColB ColC
1 X A D
2 Y
3 Z F
How can I analyze ColA, ColB and ColC together as one column? All of the values (X, Y, Z, A, D, F) belong to
the same logical column. The output is going into Tableau.

Related

Array of arrays from a dat file

I have a dat file like this:
# donnees t,x,y pour uid=12345
3.949218750000000000e+00 1.442828613651609082e+00 -8.307779446951960578e-01
1.125000000000000000e+00 1.005202962469742722e+00 5.323792795519278753e-01
3.281250000000000000e-01 1.133892308149918815e+00 1.321436062441114778e+00
5.449218750000000000e+00 -1.568679619747660459e+00 1.225514134192944526e+00
.....
.....
.....
And I would like to extract and read the data in a specific way. I have to define each column as T, X, and Y, in arrays, and then the function I use must return an array containing the 3 arrays when typing return T,X,Y
I've tried this for now:
def lecture(fichier):
    """Load the t, x and y columns of a whitespace-separated data file.

    Args:
        fichier: Path of the data file. Lines starting with ``#`` are
            treated as comments by ``np.loadtxt`` and skipped.

    Returns:
        A tuple ``(T, X, Y)`` of 1-D arrays, one per column of the file.
    """
    # ndmin=2 keeps the result 2-D even when the file holds a single data
    # row, so the column slices below always work.
    data = np.loadtxt(fichier, usecols=(0, 1, 2), ndmin=2)
    print(data, data.shape)
    # Slice columns, not rows: data[0] would be the first sample (t, x, y).
    T = data[:, 0]
    X = data[:, 1]
    Y = data[:, 2]
    return T, X, Y
But it returns me 3 arrays, not an array containing the 3 arrays.
Any idea about how to proceed?
EDIT: here is how I did it:
def lecture(fichier):
    """Read the t, x and y columns of a data file by hand.

    The first line of the file is treated as a header and skipped. Blank
    lines (for example a trailing newline at the end of the file) are
    ignored instead of raising an IndexError.

    Args:
        fichier: Path of the whitespace-separated data file.

    Returns:
        A tuple ``(T, X, Y)`` of 1-D float arrays, one per column.
    """
    with open(fichier, 'r') as f:
        lines = f.readlines()
    # lines[1:] drops the header; an empty file simply yields empty arrays.
    rows = [line.split() for line in lines[1:] if line.strip()]
    T = np.array([float(row[0]) for row in rows])
    X = np.array([float(row[1]) for row in rows])
    Y = np.array([float(row[2]) for row in rows])
    print(T.shape)
    return T, X, Y
Maybe you want this:
import numpy as np
def lecture():
    """Load 'test.txt' and wrap each value of its first three rows in a list.

    Returns:
        A tuple ``(T, X, Y)`` where each item is a list of one-element lists
        built from row 0, row 1 and row 2 of the loaded data respectively.
    """
    data = np.loadtxt('test.txt', usecols=(0, 1, 2))
    # NOTE: data[k] is the k-th ROW of the file (one t, x, y sample), not the
    # k-th column; use data[:, k] if per-column arrays are wanted.
    T = [[value] for value in data[0]]
    X = [[value] for value in data[1]]
    Y = [[value] for value in data[2]]
    return T, X, Y
data = lecture()
print(data)
print(data[0])
Output:
#complete data
([[3.94921875], [1.442828613651609], [-0.8307779446951961]],
[[1.125], [1.0052029624697427], [0.5323792795519279]],
[[0.328125], [1.1338923081499188], [1.3214360624411148]])
#data[0]
[[3.94921875], [1.442828613651609], [-0.8307779446951961]]

compare the first word from 2 text files

I have these 2 csv files. The first word of each line is the "key", and I need to compare the old file t1 with the new one t2. If the keys match, I need to display the content from the new file.
cat /tmp/t1.txt
a, b, c
d, e, f
g, m, p
cat /tmp/t2.txt
d, x, y
g, h, i
But the way this loop is written, it shows the entry from old file for the key 'g' while it works correctly for the key 'd'.
# Compare the first whitespace-separated word of every t2 line with every
# t1 line.
# NOTE: f is opened only once, so the inner loop consumes it completely while
# the first t2 line is processed; for later t2 lines the inner loop has
# nothing left to iterate over.
with open("/tmp/t1.txt", "r") as f:
    with open("/tmp/t2.txt", "r") as n:
        for nline in n:
            for fline in f:
                if nline.split()[0] == fline.split()[0]:
                    print("nline", nline)
                else:
                    print("fline", fline)
The result is:
fline a, b, c
nline d, x, y
fline g, m, p
The last line should look like this:
nline g, h, i
Basically you have to reopen the file once you read through it:
with open("/tmp/t2.txt", "r") as n:
    for nline in n:
        # Re-opening t1 for each t2 line restarts the inner iteration from
        # the first line of t1 every time.
        with open("/tmp/t1.txt", "r") as f: # <<-- moved this line here
            for fline in f:
                # some comparisons here
                pass
Another good approach would be to read both files once, and then compare the data:
# Read each file exactly once; the lists can then be iterated repeatedly.
with open("/tmp/t1.txt", "r") as fin:
    data1 = fin.readlines()  # old file
with open("/tmp/t2.txt", "r") as fin:
    data2 = fin.readlines()  # new file
# nline walks the new file (t2) and fline the old file (t1), matching the
# names used in the question.
for nline in data2:
    for fline in data1:
        # put your logic here
        pass
ok, answering the question from the comments:
# Key every line by its first comma-separated word (not just its first
# character) and skip blank lines.
with open("/tmp/t1.txt", "r") as fin:
    data1 = [(line.split(",")[0].strip(), line) for line in fin if line.strip()]
with open("/tmp/t2.txt", "r") as fin:
    data2 = {line.split(",")[0].strip(): line for line in fin if line.strip()}
# For each old line, show the new line with the same key when there is one,
# otherwise the old line itself.
for key, value in data1:
    # The stored lines still end with their own newline; strip it so print
    # does not emit an empty line after every entry.
    print(data2.get(key, value).rstrip("\n"))
Using pandas
Caveat: keys in t2 are unique
Generates a dataframe with rows updated where the t1 key matches the t2 key
import pandas as pd
# read files
# header=None: no header row is read, so the columns are labelled 0, 1, 2
t1 = pd.read_csv('t1.txt', header=None)
t2 = pd.read_csv('t2.txt', header=None)
# t2 to a dict
# keyed by the value in column 0; drop=False keeps column 0 inside each entry
t2_d = t2.set_index(0, drop=False).to_dict(orient='index')
# look for t1 in t2_d
# row by row: take the t2 entry when the row's key is in t2_d, else keep the t1 row
t1.apply(lambda x: t2_d[x[0]] if x[0] in t2_d else x, axis=1)
0 1 2
0 a b c
1 d x y
2 g h i
Optionally
This option adds a column, in_t2 to indicate whether t1 is in t2
Create updated and join it to t1 to easily verify updated values
# read files
# header=None: no header row is read, so the columns are labelled 0, 1, 2
t1 = pd.read_csv('t1.txt', header=None)
t2 = pd.read_csv('t2.txt', header=None)
# add Boolean column
# True where the t1 key (column 0) also occurs in column 0 of t2
t1['in_t2'] = t1[0].isin(t2[0])
# t2 to a dict
# keyed by the value in column 0; drop=False keeps column 0 inside each entry
t2_d = t2.set_index(0, drop=False).to_dict(orient='index')
# look for t1 in t2_d
# iloc[:, :-1] leaves out the last column (in_t2, added above) before the lookup
updated = t1.iloc[:, :-1].apply(lambda x: t2_d[x[0]] if x[0] in t2_d else x, axis=1)
# join t1 and updated
# axis=1 places the two frames side by side
pd.concat([t1, updated], axis=1)
0 1 2 in_t2 0 1 2
0 a b c False a b c
1 d e f True d x y
2 g m p True g h i

splitting dictionary and writing it to different csv file in python

I want to split the python dictionary and write it to different files based on NO_OF_LINES_PER_FILE and size of dictionary
Input
NO_OF_LINES_PER_FILE
So if NO_OF_LINES_PER_FILE = 2 and the size of the dictionary is 10, then I want the dictionary to be split into 5 files (each file will have 2 rows).
Script
import csv
NO_OF_LINES_PER_FILE = 2
s = {"2222":["1","2","3"],"3456":["2","3","4"],"5634":["4","5"],"23543":["456","3"],"29587":["4","5"],"244":["23","34"],"455":["3","4"],"244221":["5"],"23232345":["2323","43"]}
# Helper called by split() once the first file is full (Python 2 code; the
# original indentation was lost, so the exact nesting below is uncertain).
#   c: counter passed in by the caller
#   h: used to build the output file name 'num_<h>.csv'
#   NO_OF_LINES_PER_FILE1: threshold compared against the running index v
# NOTE(review): the function calls itself from inside the loop over s, and
# every call opens another file in append mode that is never closed --
# presumably the source of the "creates many files" symptom; confirm.
def again(c,h,NO_OF_LINES_PER_FILE1):
f3 = open('num_'+str(h)+'.csv', 'at')
# ceh is the first 1-based index in s that is not skipped below
if c == 1:
ceh = 2
else:
ceh = c
print ceh
v = 0
for w in s:
v = v + 1
if v < ceh:
pass
elif v > NO_OF_LINES_PER_FILE1:
print "yes"
NO_OF_LINES_PER_FILE1 = NO_OF_LINES_PER_FILE1 + 1
h = NO_OF_LINES_PER_FILE1 + 1
again(c,h,NO_OF_LINES_PER_FILE1)
else:
writer = csv.writer(f3,delimiter = ',', lineterminator='\n',quoting=csv.QUOTE_ALL)
writer.writerow(s[w])
c = c + 1
# Entry point: writes values of s to 'has_<NO_OF_LINES_PER_FILE>.csv' until c
# reaches NO_OF_LINES_PER_FILE, then hands over to again() and stops looping.
# (Original indentation was lost, so the exact nesting below is uncertain.)
# NOTE(review): f3 is opened in append mode and never closed, so re-running
# the script adds to the existing file.
def split():
f3 = open('has_'+str(NO_OF_LINES_PER_FILE)+'.csv', 'at')
writer = csv.writer(f3,delimiter = ',', lineterminator='\n',quoting=csv.QUOTE_ALL)
c = 0
for w in s:
if c >= NO_OF_LINES_PER_FILE:
NO_OF_LINES_PER_FILE1 = NO_OF_LINES_PER_FILE + 1
h = NO_OF_LINES_PER_FILE
again(c,h,NO_OF_LINES_PER_FILE1)
break
else:
#print NO_OF_LINES_PER_FILE
# only the value list s[w] is written; the key w itself is not
writer = csv.writer(f3,delimiter = ',', lineterminator='\n',quoting=csv.QUOTE_ALL)
writer.writerow(s[w])
c = c + 1
split()
But this script does not work: it creates many files.
In the above script NO_OF_LINES_PER_FILE = 2 and the size of dictionary s is 9,
so I want 5 files: the first four files should contain 2 rows each and the fifth file should contain 1 row.
How can I solve this problem?
My method is to flatten the dict first, then split the flat list into sub-lists of the length you want:
import csv
# Flatten the dict into a list of (key, values) pairs, then cut that list
# into consecutive chunks of NO_OF_LINES_PER_FILE pairs; the last chunk is
# shorter when the sizes do not divide evenly.
flatDict = list(s.items())
splitFlatDict = [
    flatDict[i:i + NO_OF_LINES_PER_FILE]
    for i in range(0, len(flatDict), NO_OF_LINES_PER_FILE)
]
# One output file per chunk: 0.csv, 1.csv, ...
for i, rows in enumerate(splitFlatDict):
    # NOTE(review): 'wb' is the Python 2 csv convention; on Python 3 open the
    # file in text mode with newline='' instead.
    with open(str(i) + '.csv', 'wb') as f:
        writer = csv.writer(f)
        # Each row is a (key, values) pair, so the whole values list is
        # written into a single cell.
        writer.writerows(rows)

Python - looping multiple lists with enumerate for same index

list1 = Csvfile1._getRow(' field1')
list2 = Csvfile2._getRow(' field1')
_list1 = Csvfile1._getRow(' field2')
_list2 = Csvfile2._getRow(' field2')
# a/b are the field1 values of the same row in Csvfile2/Csvfile1.
for i, (a, b) in enumerate(zip(list2, list1)):
    value = False
    if field == ' field1':
        # Look up the field2 pair (c, d) at the same row index.
        # NOTE: this rescans the field2 lists for every row, so the total
        # work grows quadratically with the number of rows.
        for j, (c, d) in enumerate(zip(_list2, _list1)):
            if i == j:
                if a != b and c != d:
                    value = True
                else:
                    value = False
                break
    # Both fields differ: skip this row.
    if value == True:
        continue
    if a != b:
        # do something
        pass
Below is the sample :
values in both the csv files are compared. when the value for field1
is not equal in both csv files, the condition if a != b: should be executed.
When the value for field1 is not equal in both csv files, and at the same time if the values for field2 is also not equal -> then the condition if a != b: should not be executed.
With huge data this seems to be not working. Or is there a better way to achieve this ?
Csvfile1
field1 | field2
222 | 4 -> enter if a != b: condition loop
435 | 5 -> do not enter if a != b: condition loop
Csvfile2
field1 | field2
223 | 4
436 | 6
If I got right what you want to do, try something like this:
$ cat t1.txt
field1|field2
222|4
435|5
$ cat t2.txt
field1|field2
223|4
436|6
$ python
import csv

# NOTE(review): 'rb' is the Python 2 csv convention; on Python 3 open the
# files in text mode with newline='' instead.
with open("t1.txt", "rb") as csvfile:
    with open("t2.txt", "rb") as csvfile2:
        reader = csv.reader(csvfile, delimiter='|')
        reader2 = csv.reader(csvfile2, delimiter='|')
        # Walk both files in lockstep, row by row and field by field, and
        # report every pair of fields that differs.
        for row1, row2 in zip(reader, reader2):
            for elem1, elem2 in zip(row1, row2):
                if elem1 != elem2:
                    print("different: {} and {}".format(elem1, elem2))
different: 222 and 223
different: 435 and 436
different: 5 and 6
#first field(ff) second field(sf) first file(ff) second file(sf)
field1csv1 = Csvfile1._getRow(' field1')
field1csv2 = Csvfile2._getRow(' field1')
field2csv1 = Csvfile1._getRow(' field2')
field2csv2 = Csvfile2._getRow(' field2')
Every time you have huge lists of data you should think about using a generator instead of a list comprehension. itertools.izip is a generator version of zip.
Plugging it in should give you a considerable improvement, as no temporary lists will be generated:
from itertools import izip

# Same loop as in the question, with izip (Python 2) so that no intermediate
# list of pairs is built.
for i, (a, b) in enumerate(izip(list2, list1)):
    value = False
    if field == ' field1':
        # Look up the field2 pair (c, d) at the same row index.
        for j, (c, d) in enumerate(izip(_list2, _list1)):
            if i == j:
                if a != b and c != d:
                    value = True
                else:
                    value = False
                break
    # Both fields differ: skip this row.
    if value == True:
        continue
    if a != b:
        # do something
        pass
This is an example of how to refactor your code to get rid of the iteration in python and drop the iteration to the C level:
#orig
# For every row index i, value ends up True only when both the field1 pair
# (a, b) and the field2 pair (c, d) at that index differ.
for i, (a, b) in enumerate(zip(list2, list1)):
    value = False
    if field == ' field1':
        for j, (c, d) in enumerate(zip(_list2, _list1)):
            if i == j:
                if a != b and c != d:
                    value = True
                else:
                    value = False
                break
With generators:
from itertools import izip
# Pair the rows up lazily: each item is ((a, b), (c, d)), where (a, b) is the
# field1 pair and (c, d) the field2 pair of the same row.
mygen = izip(izip(list2,list1),izip(_list2,_list1))
#[((a, b), (c, d)), ((x, y), (_x, _y)), ...]
# True where both field1 and field2 differ between the two files. Each inner
# tuple has exactly two items, so the pairs are unpacked directly instead of
# being indexed.
values = [a != b and c != d for (a, b), (c, d) in mygen]
Also you could use "equality" generators:
# Lazily pair up the values of each field across the two files ...
field1 = izip(field1csv1, field1csv2)
field2 = izip(field2csv1, field2csv2)
# ... and lazily yield True for every row where the paired values are equal.
field1equal = (left == right for left, right in field1)
field2equal = (left == right for left, right in field2)
I got this far and then gave up. I have no idea what you're doing.

How to separate rows of a csv file

I'm having a problem rewriting a CSV file. I had a CSV file with 20 columns and I rewrote it with only 5. Now I need to take out a couple of unnecessary points, where SN < 20. It works; the only problem is that it doesn't separate the rows. It puts everything in row 1. I'm guessing that it's from the line
output_ary.append(row)
but I don't know what else to write there. Here is a part of the code:
import csv
import os
import matplotlib.pyplot as plt
# Raw string keeps the backslashes literal ("\U" is an invalid escape in a
# normal Python 3 string literal).
os.chdir(r"C:\Users\Robert\Documents\qwe")
r = csv.reader(open("gdweights_feh_robert_cmr.csv",'rU'))
w = csv.writer(open("gdweight.csv",'wb',buffering=0))

# Pass 1: reduce every data row to five values.
zerovar2 = 0
for row in r:
    if zerovar2==0:
        # first row of the input is skipped
        zerovar2 = zerovar2 + 1
    else:
        sn = float(row[11])
        rweight = float(row[17])
        tarweight = float(row[18])
        fehadop = float(row[25])
        weight = rweight*tarweight*fehadop
        w.writerow([sn,rweight,tarweight,fehadop,weight])

# Pass 2: keep only the rows whose first value (sn) is above 20.
output_ary = []
with open("gdweight.csv",'rU') as f:
    reader = csv.reader(f, delimiter= ',')
    zerovar = 0
    for row in reader:
        if zerovar==0:
            # NOTE(review): pass 1 writes no header row, so this skips the
            # first data row of gdweight.csv -- confirm that is intended.
            zerovar = zerovar + 1
        else:
            sn = row [0]
            zerovar = zerovar + 1
            x = float(sn)
            if x > 20:
                output_ary.append(row)

# NOTE: no newline is written between rows, so every value ends up on one
# line of ouput1.csv.
with open("ouput1.csv",'w') as f2:
    for row in output_ary:
        for item in row:
            f2.write(item + ",")
with open("ouput1.csv",'w') as f2:
    for row in output_ary:
        for item in row:
            f2.write(item + ",")
        # End the line once per row, after all of its items are written.
        f2.write("\n") # this is what you're missing
Simply rewrite the last part as:
with open("ouput1.csv",'w') as f2:
    for row in output_ary:
        # Join the items of the current row with commas (no trailing comma)
        # and terminate the line.
        f2.write(",".join(str(e) for e in row) + '\n')
Now here are a couple of additional comments :
you can use enumerate instead of using a counter :
for i_row, row in enumerate(r) :
...
you can also use a csv writer :
with open("output.txt", "w") as f:
    csv_w = csv.writer(f)
    for i_row, row in enumerate(output):
        # Skip the first row; csv.writer takes care of the separators and
        # line endings for the remaining ones.
        if i_row == 0:
            continue
        csv_w.writerow(row)

Categories