Storing a variable number of values per item - python

I want to read a csv file and store it in a defaultdict(list):
Pos ID Name
1 0001L01 50293
2 0002L01 128864
3 0003L01 172937
4 0004L01 12878
5 0005L01 demo
6 0004L01 12878
7 0004L01 12878
8 0005L01 demo
I would like the IDs to be my keys, with Pos and Name as the values. However, the number of Pos entries varies: for instance, ID 0005L01 appears at Pos 5 and 8, whereas 0001L01 only appears at Pos 1. Is there a way of doing that?
So far I have:
import csv, itertools
from collections import defaultdict

d = defaultdict(list)
dlist = []
reader = csv.reader(open("sheet.csv", "rb"))
for row in reader:
    if any(row):
        dlist.append(row)
for k, g in itertools.groupby(zip(dlist, itertools.count()), key=lambda x: x[0][1]):
    map(lambda x: d[k].append((x[0][0], x[1], x[0][2])), g)

You can use the dict.setdefault method to create the expected dictionary:
import csv

d = {}
with open('my_file.csv', 'rb') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=' ')
    for row in spamreader:
        try:
            Pos, ID, Name = row
            d.setdefault(ID, []).append([Pos, Name])
        except ValueError:
            continue
result:
{'0001L01': [['1', '50293']],
'0003L01': [['3', '172937']],
'0002L01': [['2', '128864']],
'0005L01': [['5', 'demo'], ['8', 'demo']],
'0004L01': [['4', '12878'], ['6', '12878'], ['7', '12878']]}
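Since the question mentions defaultdict(list), an equivalent sketch using collections.defaultdict instead of setdefault (same assumptions about the space-delimited file) would be:
import csv
from collections import defaultdict

d = defaultdict(list)
with open('my_file.csv', 'rb') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=' ')
    for row in spamreader:
        try:
            Pos, ID, Name = row
            d[ID].append([Pos, Name])  # no setdefault needed with defaultdict
        except ValueError:
            continue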
As @tobias_k says, if you don't have a Pos column in your file you can use enumerate to create it manually:
import csv

d = {}
with open('my_file.csv', 'rb') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=' ')
    for Pos, row in enumerate(spamreader, 1):
        try:
            ID, Name = row
            d.setdefault(ID, []).append([Pos, Name])
        except ValueError:
            continue
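Note that opening the file in 'rb' mode only works with the csv module on Python 2; on Python 3 the file should be opened in text mode, ideally with newline='', for example:
with open('my_file.csv', newline='') as csvfile:  # Python 3
    spamreader = csv.reader(csvfile, delimiter=' ')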

Related

Python Array To Csv, Group data from a CSV file by field value and save csv

I have a csv file:
col1 col2
a 1
a 2
a 3
b 3
b 6
b 1
I use this code:
import csv

result = {}
with open('data.csv', 'rb') as csvfile:
    csvreader = csv.reader(csvfile, delimiter=',', quotechar='"')
    for row in csvreader:
        if row[0] in result:
            result[row[0]].append(row[1])
        else:
            result[row[0]] = [row[1]]
print result
output
result:{
'a': ['1', '2', '3'],
'b': ['3', '6', '1']
}
Now I want to save this result to a csv file (I want to use this method):
result.csv
a b
1 3
2 6
3 1
SOLVED
df_zor = pd.read_csv('5_gecici_.csv', encoding="windows-1250", sep=';')
df = df_zor.groupby("gorevli").agg(lambda x: x.unique().tolist()).T
max_len = df.apply(lambda x: len(x.iloc[0]), axis=0).max()
padded = df.apply(lambda x: np.pad(x.iloc[0], (0, max_len - len(x.iloc[0])), 'constant', constant_values=np.nan), axis=0)
padded[df_zor.gorevli.unique()].to_excel('5_gorevli_bazinda_incelemede_dosya_listesi.xlsx', index=False)
You can use the following code:
to_csv = []
# to get 'a' and 'b' in the first row
to_csv.append(list(result.keys()))
# to get the 'a' and 'b' values
for a, b in zip(*list(result.values())):
    to_csv.append([a, b])
# writing result.csv
with open('result.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(to_csv)
The data in result.csv is:
a,b
1,3
2,6
3,1
You can use pandas
import pandas as pd
results_df = pd.DataFrame(data=result)
results_df.to_csv("result.csv", index=False)
Just add this at the end:
with open('output.csv', 'w', newline='') as fw:
    cw = csv.writer(fw, delimiter=',')
    row_list = [list(result.keys())] + [x for x in zip(*result.values())]
    cw.writerows(row_list)
EDIT: I see my answer is a bit late. My answer is identical to @Mohnish's answer, except that I use a list comprehension instead of explicitly looping through the zip object. @Mohnish's answer has more explanation, so go with that.
EDIT: Basically, instead of this:
# To get the 'a' and 'b' values
for a, b in zip(*list(result.values())):
    to_csv.append([a, b])
do this:
to_csv += [x for x in zip(*result.values())]
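All of the zip-based answers above assume every key has the same number of values; if the groups can have different lengths, a hedged variant with itertools.zip_longest (Python 3) pads the shorter columns with empty strings:
import csv
from itertools import zip_longest

with open('result.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(result.keys())                                  # header row: a, b
    writer.writerows(zip_longest(*result.values(), fillvalue=''))   # columns padded to equal length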

Python unique values per column in csv file row

I have been crunching on this for a long time. Is there an easy way, using NumPy or pandas or by fixing my code, to get the unique values for each column in a row, where the values are separated by "|"?
I.e. the data:
"id","fname","lname","education","gradyear","attributes"
"1","john","smith","mit|harvard|ft|ft|ft","2003|207|212|212|212","qa|admin,co|master|NULL|NULL"
"2","john","doe","htw","2000","dev"
Output should be:
"id","fname","lname","education","gradyear","attributes"
"1","john","smith","mit|harvard|ft","2003|207|212","qa|admin,co|master|NULL"
"2","john","doe","htw","2000","dev"
My broken code:
import csv
import pprint

your_list = csv.reader(open('out.csv'))
your_list = list(your_list)
# pprint.pprint(your_list)
string = "|"
cols_no = 6
for line in your_list:
    i = 0
    for col in line:
        if i == cols_no:
            print "\n"
            i = 0
        if string in col:
            values = col.split("|")
            myset = set(values)
            items = list()
            for item in myset:
                items.append(item)
            print items
        else:
            print col + ",",
        i = i + 1
It outputs:
id, fname, lname, education, gradyear, attributes, 1, john, smith, ['harvard', 'ft', 'mit']
['2003', '212', '207']
['qa', 'admin,co', 'NULL', 'master']
2, john, doe, htw, 2000, dev,
Thanks in advance!
numpy/pandas is a bit of overkill for what you can achieve with csv.DictReader and csv.DictWriter plus a collections.OrderedDict, e.g.:
import csv
from collections import OrderedDict

# If using Python 2.x, use `open('output.csv', 'wb')` instead
with open('input.csv') as fin, open('output.csv', 'w') as fout:
    csvin = csv.DictReader(fin)
    csvout = csv.DictWriter(fout, fieldnames=csvin.fieldnames, quoting=csv.QUOTE_ALL)
    csvout.writeheader()
    for row in csvin:
        for k, v in row.items():
            row[k] = '|'.join(OrderedDict.fromkeys(v.split('|')))
        csvout.writerow(row)
Gives you:
"id","fname","lname","education","gradyear","attributes"
"1","john","smith","mit|harvard|ft","2003|207|212","qa|admin,co|master|NULL"
"2","john","doe","htw","2000","dev"
If you don't care about the order of the items separated by |, this will work:
lst = ["id","fname","lname","education","gradyear","attributes",
"1","john","smith","mit|harvard|ft|ft|ft","2003|207|212|212|212","qa|admin,co|master|NULL|NULL",
"2","john","doe","htw","2000","dev"]
def no_duplicate(string):
return "|".join(set(string.split("|")))
result = map(no_duplicate, lst)
print result
result:
['id', 'fname', 'lname', 'education', 'gradyear', 'attributes', '1', 'john', 'smith', 'ft|harvard|mit', '2003|207|212', 'NULL|admin,co|master|qa', '2', 'john', 'doe', 'htw', '2000', 'dev']

How can I turn a csv file into a list of list in python

I want to be able to turn a csv file into a list of lists containing the column values, one list per column. For example:
6,2,4
5,2,3
7,3,6
into
[[6,5,7],[2,2,3],[4,3,6]]
I've only managed to open the file, and I've only had success printing it as rows:
with open(input, 'rb') as csvfile:
    csv_file = csv.reader(csvfile)
    header = csv_file.next()
    raw_data = csv_file
In case you are sure there is a fixed number of items in each row, you can use zip:
import csv

with open('test.csv') as csvfile:
    rows = csv.reader(csvfile)
    res = list(zip(*rows))
    print(res)
    # [('6', '5', '7'), ('2', '2', '3'), ('4', '3', '6')]
Or in case there is a different number of items per row:
6,2,4
5,2
7
Use zip_longest and filter:
import csv
from itertools import zip_longest

with open('test.txt') as csvfile:
    rows = csv.reader(csvfile)
    res = list(zip_longest(*rows))
    print(res)
    # [('6', '5', '7'), ('2', '2', None), ('4', None, None)]
    res2 = [list(filter(None.__ne__, l)) for l in res]
    print(res2)
    # [['6', '5', '7'], ['2', '2'], ['4']]
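As a side note, the filter(None.__ne__, l) step can also be written as a list comprehension, which some find more readable:
res2 = [[x for x in col if x is not None] for col in res]
# [['6', '5', '7'], ['2', '2'], ['4']]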
You could probably start by reading it into a list of lists first:
from csv import reader as csvreader

with open(input, 'r') as fp:
    reader = csvreader(fp)
    li = list(reader)
Then chop it into a new sequence. I'm sure there are other tricks with itertools, but this is what I came up with:
from itertools import count

def my_gen():
    for i in count():
        try:
            yield [x[i] for x in li]
        except IndexError:
            break
You can now turn the generator into a list, which will have the desired columns as rows.
list(my_gen())
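With the sample data from the question (and assuming the file has no header row), this should yield the transposed rows, with the values still as strings:
list(my_gen())  # [['6', '5', '7'], ['2', '2', '3'], ['4', '3', '6']]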
Or maybe like this...
from csv import reader

with open('test.csv') as csv_file:
    csv_reader = reader(csv_file)
    rows = list(csv_reader)
print(rows)

Reading a file in python and creating a list turns out to an empty list

I am using the following code to read from a file:
G = {}
for line in fin:  # load data, create adj lists
    lst = []
    size = len(line)
    i = 0
    while (i < size):
        if line[i] in ' \t\r\n': pass
        else:
            lst.append(int(line[i:i+2].strip()))
            i = i + 1
        i = i + 1
    G[lst[0]] = lst[1:]
print(G)
But G always turns out to be empty. "foo1.txt" contains the following data:
1 15
2 6 10
3 10
4 9
5
I have written alternate code for appending the data from the file to a list; you can refer to this code if you want:
f = open('filename')
dic = []
for l in f:
    v = l.split(' ')
    v = map(lambda s: s.strip(), v)
    dic = dic + v
print dic
output:
['1', '15', '2', '6', '10', '3', '10', '4', '9', '5', '']
Try this, it's the most Pythonic I think:
fin = open("foo1.txt")
G = {}
for line in fin:  # load data, create adj lists
    lst = line.split()
    lst = [int(x) for x in lst]
    G[lst[0]] = lst[1:]
print(G)
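A more compact variant of the same idea, sketched as a dict comprehension (assuming one adjacency list per line, and skipping blank lines):
with open("foo1.txt") as fin:
    G = {nums[0]: nums[1:]
         for nums in ([int(x) for x in line.split()] for line in fin)
         if nums}
print(G)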

Iterate over columns in a text file in python

I have a text file in the following format:
1,"20130219111529","90UP:34","0000","9999","356708","2"
"-2","20130219105824","0001:11","0000","","162_005",""
I want to compare row 1 and row 2 (in this case 1 and -2) for some purpose. To strip out all the quotes and parse this file, I have the following code:
if os.path.exists(FileName):
    with open(FileName) as File:
        for row in csv.reader(File, delimiter=',', skipinitialspace=True):
            print(row)
The following is the output:
['1', '20130219111529', '90UP:34', '0000', '9999', '356708', '2']
['-2', '20130219105824', '0001:11', '0000', '', '162_005', '']
I want to iterate through the columns. For example, iterate through '1' then '-2' and so on.
How do I go about doing this?
Use zip(). It turns two iterables into one iterable of tuples, with elements coming from both lists.
l1 = ['1', '20130219111529', '90UP:34', '0000', '9999', '356708', '2']
l2 = ['-2', '20130219105824', '0001:11', '0000', '', '162_005', '']

for elem1, elem2 in zip(l1, l2):
    print("elem1 is {0} and elem2 is {1}.".format(elem1, elem2))
Perhaps the following.
if os.path.exists(FileName):
    with open(FileName) as File:
        lastRow = []
        # loop over the lines in the file
        for row in csv.reader(File, delimiter=',', skipinitialspace=True):
            # save the first row, for comparison below
            if lastRow == []:
                lastRow = row
                continue
            # loop over the columns, if all rows have the same number
            for colNum in range(len(row)):
                # compare row[colNum] and lastRow[colNum] as you wish
                pass
            # save this row, to compare with the next row in the loop
            lastRow = row
Just print the first element of the row:
for row in csv.reader(File, delimiter=',', skipinitialspace=True):
    print(row[0])
EDIT
rows = list(csv.reader(File, delimiter=',', skipinitialspace=True))
print(len(rows))  # how many rows were read from the file
for row in rows:
    print(row[0])
If (as you said in the question, though I'm not sure if you wanted this) you want to iterate through the columns, you can do the following:
if os.path.exists(file_name):
    with open(file_name) as csv_file:
        for columns in zip(*csv.reader(csv_file, delimiter=',', skipinitialspace=True)):
            print columns
This will output the following:
('1', '-2')
('20130219111529', '20130219105824')
('90UP:34', '0001:11')
('0000', '0000')
('9999', '')
('356708', '162_005')
('2', '')
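If you only need the first column (the '1' and '-2' values from the question), a minimal sketch takes the first of those transposed tuples:
with open(file_name) as csv_file:
    columns = list(zip(*csv.reader(csv_file, delimiter=',', skipinitialspace=True)))
    first_column = columns[0]  # ('1', '-2')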
