Update CSV file by column name using Python - python

I have the csv file as follows:
product_name, product_id, category_id
book, , 3
shoe, 3, 1
lemon, 2, 4
I would like to update product_id of each row by providing the column name using python's csv library.
So for an example if I pass:
update_data = {"product_id": [1,2,3]}
then the csv file should be:
product_name, product_id, category_id
book, 1, 3
shoe, 2, 1
lemon, 3, 4

You can use your existing dict and iter to take items in order, eg:
import csv

update_data = {"product_id": [1, 2, 3]}

def update_csv(in_path, out_path, update_data):
    """Copy *in_path* to *out_path*, replacing column values row by row.

    update_data maps a column name to a list of replacement values; the
    i-th value goes into the i-th data row.  When a list runs out, the
    row's existing value is kept.
    """
    # Convert the values to iterators so we can `next` them one row at a time.
    to_update = {k: iter(v) for k, v in update_data.items()}
    # Python 3: csv files must be opened in TEXT mode with newline=''
    # (the original 'rb'/'wb' is Python 2 style and fails on 3.x).
    with open(in_path, newline='') as fin, open(out_path, 'w', newline='') as fout:
        # skipinitialspace makes ' product_id' match the update dict's keys.
        csvin = csv.DictReader(fin, skipinitialspace=True)
        csvout = csv.DictWriter(fout, csvin.fieldnames)
        csvout.writeheader()
        for row in csvin:
            # Use the next pending value if the column is being updated and
            # values remain; otherwise keep what is already in the row.
            row.update({k: next(to_update[k], row[k]) for k in row if k in to_update})
            csvout.writerow(row)

if __name__ == "__main__":
    update_csv('input.csv', 'output.csv', update_data)
Now - this assumes that each new column value goes to the row number and that the existing values should be used after that. You could change that logic to only use new values when the existing value is blank for instance (or whatever other criteria you wish).

(assuming you're using 3.x)
Python has a CSV module in the standard library which helps read and amend CSV files.
Using that I'd find the index for the column you are after and store it in the dictionary you've made. Once that has been found it's simply a matter of popping the list item into each row.
import csv

# Nest the original value list inside another list so that slot 0 can
# hold the column's index once it is located in the header row.
update_data = {"product_id": [None, [1, 2, 3]]}

def update_csv_columns(path, update_data):
    """Rewrite *path* in place, substituting the listed values into the
    named columns — one value per data row, consumed in order.

    update_data maps a column name to ``[index_slot, [values...]]``.
    """
    new_csv = []  # holds the rewritten rows until the file is reopened for writing
    # newline='' is required by the csv module on Python 3 (without it,
    # extra blank rows appear on Windows).
    with open(path, 'r', newline='') as csvfile:
        filereader = csv.reader(csvfile)
        for line_no, line in enumerate(filereader):
            if line_no == 0:
                # Header row: find and remember each target column's index.
                for key in update_data:
                    update_data[key][0] = line.index(key)
            else:
                # Data row: move the next pending value into the right column,
                # removing it from the input list as we go.
                for key in update_data:
                    line[update_data[key][0]] = update_data[key][1].pop(0)
            new_csv.append(line)
    with open(path, 'w', newline='') as csvfile:
        csv.writer(csvfile).writerows(new_csv)

if __name__ == "__main__":
    update_csv_columns('test.csv', update_data)

Related

How to merge CSV files such that rows with a unique identifier are added into the same row of the output?

I'm using Python to merge 4 headerless CSV's into one output file.
Each CSV has a unique number in the first column as shown in the 2 example CSV files below:
1.csv
1,Ringo,Beatles
2,John,Beatles
3,Mick,Rolling Stones
4,Keith,Rolling Stones
5,Rivers,Weezer
2.csv
1,TSLA,XNAS,1.0,USD
2,AAPL,XNAS,1.0,USD
3,SPY,ARCX,1.0,USD
4,BP LN,XLON,1.0,GBP
5,ESUD,XCME,1.0,USD
I have generated the output from these CSV's using the following code.
import os
import csv

filenames = ['1.csv', '2.csv', '3.csv', '4.csv']

# Concatenate the raw text of every input file into one output file.
# NOTE(review): this never closes `outfile`'s inputs early or reads them
# lazily; for very large files, copying line-by-line would bound memory.
with open('output_file', 'w') as outfile:
    for fname in filenames:
        with open(fname) as infile:
            # shutil.copyfileobj-style chunking would also work; a single
            # read is fine for the modest files in the question.
            outfile.write(infile.read())
This works fine and outputs a file. The data ends up as follows
1,Ringo,Beatles
2,John,Beatles
3,Mick,Rolling Stones
4,Keith,Rolling Stones
5,Rivers,Weezer
1,TSLA,XNAS,1.0,USD
2,AAPL,XNAS,1.0,USD
3,SPY,ARCX,1.0,USD
4,BP LN,XLON,1.0,GBP
5,ESUD,XCME,1.0,USD1,5,-600,1043.22,-625932.00
3,5,200,304.89,60978.00
5,4,6,3015.25,904575.005,4,-1,2,3009.50
5,4,1,1,3011.75
4,3,1,1000,308.37
4,3,1,200,309.15
1,3,1,100,309.0125
Is there a way to use the first column number as a 'unique' number to link the data, such that it takes the three results that start with '1', and adds them to the same row?
For example, these have the same 'unique' number '1':
1,Ringo,Beatles
1,TSLA,XNAS,1.0,USD
1,3,1,100,309.0125
The resulting row would be:
(1) Ringo,Beatles,TSLA,XNAS,1.0,USD,3,1,100,309.0125
You could use dictionary to put all data as
{
1: [1, "Ringo", "Beatles", 1, "TSLA", "XNAS", 1.0, "USD", 1, 3, 1, 100, 309.0125],
2: [2, ...],
3: [3, ...],
...
}
and later write all in new file.
So first create empty dictionary. ie. new_rows = {}
Next get row from file, get ID and check if it exists in dictionary. If not exists then create it with list with has only ID new_rows[key] = [key]
Next you can add other values from row to this list new_rows[key] += values
Repeate it for all rows in all files.
And later you can use this dictionary to write all rows to new file.
I use io only to simulate files in memory but you should use open()
# In-memory stand-ins for real files; with actual files you would use open().
text1 = '''1,Ringo,Beatles
2,John,Beatles
3,Mick,Rolling Stones
4,Keith,Rolling Stones
5,Rivers,Weezer'''
text2 = '''1,TSLA,XNAS,1.0,USD
2,AAPL,XNAS,1.0,USD
3,SPY,ARCX,1.0,USD
4,BP LN,XLON,1.0,GBP
5,ESUD,XCME,1.0,USD'''

import os
import csv
import io

new_rows = {}  # maps ID (first column) -> accumulated row values

filenames = [text1, text2]
#filenames = ['1.csv', '2.csv', '3.csv', '4.csv']

for fname in filenames:
    #with open(fname) as infile:
    with io.StringIO(fname) as infile:
        for record in csv.reader(infile):
            # The first column is the join key; everything after it is payload.
            key, rest = record[0], record[1:]
            # Start the row with its ID the first time the key is seen,
            # then append this file's values after any earlier ones.
            new_rows.setdefault(key, [key]).extend(rest)

# --- write the merged rows out ---
with open('output_file', 'w') as outfile:
    # writerows (plural) writes the whole list of rows in one call.
    csv.writer(outfile).writerows(new_rows.values())
BTW:
In older Python dict didn't have to keep order so it could write new rows in different order - and it would need to sort list of rows before saving or it would need to use collections.OrderedDict()

How to use python to read excel column data and print column duplicates

Two columns ("Name" & "Value") in excel.
There are duplicates (eg. "xxa","xxf") in the Value column and the python script needs to find what are the duplicates cell values and put them into an array
The output should be
{
"xxa": ["aaa","bbb","ccc","hhh"],
"xxf": ["fff","jjj"]
}
How to improve the current script?
# NOTE(review): this snippet relies on `import csv` and on `value_col` and
# `name_value_col` being initialised to empty lists earlier — neither is
# shown here; confirm before running.
file = open('columnData.csv')
csvreader = csv.reader(file)
next(csvreader)  # skip the header row
for row in csvreader:
    name = row[0]
    value = row[1]
    value_col.append(value)                # every value, for duplicate counting below
    name_value_col.append(name+","+value)  # "name,value" pairs for the final pass
file.close()
# Count occurrences of each value that appears more than once.
# NOTE(review): list.count inside the loop makes this O(n^2); a
# collections.Counter would do it in one pass.
count={}
names=[]
for item in value_col:
    if value_col.count(item)>1:
        count[item]=value_col.count(item)
for name,value in count.items():
    names.append(name)  # the duplicated values themselves
# Collect the names whose value is one of the duplicates.
total=[]
for item in name_value_col:
    item_name=item.split(",")
    if item_name[1] in names:
        total.append(item_name[0])
print(total)
I'd recommend using defaultdict, and while you're at it using csv.DictReader makes for more legible code:
import csv
from collections import defaultdict

# Group every Name under its Value.  defaultdict creates the list on
# first access, so no membership test is needed before appending.
data = defaultdict(list)
with open('columnData.csv') as csv_file:
    for record in csv.DictReader(csv_file):
        data[record['Value']].append(record['Name'])
and then regarding duplicate finding you can EITHER take the destructive approach (pruning non-duplicates)
# Destructive approach: delete every key that collected only one name.
# NOTE(review): depends on `data` built by the preceding snippet.
for key in list(data.keys()):  # copy the keys — we delete while iterating
    if len(data[key]) == 1:    # only one value in the list -> not a duplicate
        del data[key]
print(dict(data))
>>> {"xxa": ["aaa","bbb","ccc","hhh"], "xxf": ["fff","jjj"]}
or if you prefer a non-destructive approach to finding duplicates:
def _filter_duplicates(data):
for key, value in data.items():
if len(value) > 1:
yield key, value
def find_duplicates(data):
return dict(_filter_duplicates(data))
print(find_duplicates(data))
>>> {"xxa": ["aaa","bbb","ccc","hhh"], "xxf": ["fff","jjj"]}

Create dictionary from CSV where column names are keys

I'm trying to read a csv file (actually a tsv, but nvm) and set it as a dictionary where its key are the column names of said csv and the rest of the rows are values for those keys.
I also have some comments marked by the '#' character, which I intend to ignore:
csv_in.csv
##Some comments
##Can ignore these lines
Location Form Range <-- This would be the header
North Dodecahedron Limited <---|
East Toroidal polyhedron Flexible <------ These lines would be lists
South Icosidodecahedron Limited <---|
The main idea is to store them like this:
final_dict = {'Location': ['North','East','South'],
'Form': ['Dodecahedron','Toroidal polyhedron','Icosidodecahedron'],
'Range': ['Limited','Flexible','Limited']}
So far I could come close like so:
tryercode.py
import csv
dct = {}
# Open csv file
# NOTE(review): `tsvfile` is not defined in this snippet — presumably the
# path to the input TSV; confirm against the caller.
with open(tsvfile) as file_in:
    # Open reader instance with tab delimiter
    reader = csv.reader(file_in, delimiter='\t')
    # Iterate through rows
    for row in reader:
        # First I skip those rows that start with '#'
        if row[0].startswith('#'):
            pass
        elif row[0].startswith('L'):
            # Header row (starts with 'L' for 'Location'): remember the
            # column names in order and seed the dict with empty lists.
            dictkeys_list = []
            for i in range(len(row)):
                dictkeys_list.append(row[i])
                dct[row[i]] = []
        else:
            # Insert each row indexes as values by the quantity of rows
            # (left unimplemented by the question's author).
            print('¿?')
So far, the dictionary's skeleton looks fine:
print(dct)
{'Location': [], 'Form': [], 'Range': []}
But everything I tried so far failed to append the values to the keys' empty lists the way it is intended. Only could do so for the first row.
(...)
else:
    # Insert each row indexes as values by the quantity of rows
    print('¿?')
    for j in range(len(row)):
        # Here I indicate the intended key of the dict through the
        # previously built list of key names.
        # NOTE(review): plain assignment REPLACES the key's list with the
        # last row's cell instead of appending — this is why only one row
        # survives; `.append(row[j])` would collect every row's value.
        dct[dictkeys_list[j]] = row[j]
I searched far and wide on Stack Overflow but couldn't find an answer for this case (the code template is inspired by an answer to this post, but that dictionary has a different structure).
Using collections.defaultdict, we can create a dictionary that automatically initialises its values as lists. Then we can iterate over a csv.DictReader to populate the defaultdict.
Given this data:
A,B,C
a,b,c
aa,bb,cc
aaa,bbb,ccc
This code
import collections
import csv

# One list per column, keyed by the csv header name; the defaultdict
# creates each list lazily on first use.
d = collections.defaultdict(list)
with open('myfile.csv', 'r', newline='') as f:
    for record in csv.DictReader(f):
        for column, cell in record.items():
            d[column].append(cell)
print(d)
Produces this result:
defaultdict(<class 'list'>, {'A': ['a', 'aa', 'aaa'],
'B': ['b', 'bb', 'bbb'],
'C': ['c', 'cc', 'ccc']})
I amend something in your code and run it. Your code can work with the right result.
The code is below
import csv

dct = {}
tsvfile = "./tsv.csv"  # path to the tab-separated input file

# Single pass over the file: skip '#' comment lines, treat the row that
# starts with 'L' (for 'Location') as the header, and append every later
# row's cells to the matching column's list.
with open(tsvfile) as file_in:
    reader = csv.reader(file_in, delimiter='\t')
    for row in reader:
        if row[0].startswith('#'):
            continue
        if row[0].startswith('L'):
            # Header: remember the column names in order and seed the dict.
            dictkeys_list = list(row)
            for column in row:
                dct[column] = []
        else:
            # Data row: the cell at position i belongs to column i.
            for position, cell in enumerate(row):
                dct[dictkeys_list[position]].append(cell)

print(dct)
Running result like this
Besides, I amend your further like below, I think the code can deal with more complicated situation
import csv

dct = {}
tsvfile = "./tsv.csv"  # path to the tab-separated input file
is_head = True         # True until the header row has been consumed

with open(tsvfile) as file_in:
    reader = csv.reader(file_in, delimiter='\t')
    for row in reader:
        # Skip blank lines and '#' comment lines; strip() so leading
        # whitespace cannot hide the comment marker.
        # (Idiom fix: the original tested `row.__len__()==0` — call len()
        # or use truthiness rather than invoking the dunder directly.)
        if not row or row[0].strip().startswith('#'):
            continue
        if is_head:
            # First real row is the header: record the column names in
            # order and seed the dict with an empty list per column.
            is_head = False
            dictkeys_list = []
            for item in row:
                item = item.strip()
                dictkeys_list.append(item)
                dct[item] = []
        else:
            # Data row: append each stripped cell to its column's list.
            for i, cell in enumerate(row):
                dct[dictkeys_list[i]].append(cell.strip())

print(dct)
Hi you can try the pandas library.
import pandas as pd

# Read the csv into a DataFrame, then convert it column-wise:
# orient="list" yields {column_name: [column values]}.
df = pd.read_csv("csv_in.csv")
df.to_dict(orient="list")
To reproduce this, I have created a csv file with below content and saved as 'csvfile.csv'.
Location,Form,Range
North,Dodecahedron,Limited
East,Toroidal polyhedron,Flexible
South,Icosidodecahedron,Limited
Now to achieve your goal, I have used pandas library as below:
import pandas as pd

# Load the csv (the first line becomes the column index).
df_csv = pd.read_csv('csvfile.csv')
# orient='list' -> {column name: list of that column's values}.
dict_csv = df_csv.to_dict(orient='list')
print(dict_csv)
and here's the output as you needed:
{'Location': ['North', 'East', 'South'],
'Form': ['Dodecahedron', 'Toroidal polyhedron', 'Icosidodecahedron'],
'Range': ['Limited', 'Flexible', 'Limited']}
Hope, this helps.

Writing Python's List or Dictionary into CSV file (Row containing More than One Column)

I am just starting up Python!!
i want to make a CSV file where i have a dictionary and i want to print each member of it in its own column in the same row.
like i have an array of dictionaries and i want each row to represent one of them and each column of each row to represent an item inside.
import csv

"... we are going to create an array of dictionaries and print them all..."

st_dic = []  # one dict per student entered at the prompt
while True:  # idiomatic form of the original `true = 1; while true:` loop
    dummy = input("Please Enter name, email, mobile, university, major")
    x = dummy.split(",")
    # Typing "Stop" in any field ends data entry.
    if "Stop" in x:
        break
    # Renamed from `dict`, which shadowed the builtin.
    record = {"Name": x[0], "Email": x[1], "Mobile": x[2],
              "University": x[3], "Major": x[4]}
    st_dic.append(record)

# One open, one writer: the original opened data.csv twice ('w' then
# 'r+') and leaked the first (truncating) handle.  newline='' prevents
# the blank rows the csv module otherwise produces on Windows.
with open("data.csv", "w", newline='') as f:
    writer = csv.writer(f)
    for item in st_dic:
        writer.writerow([item["Name"], item["Email"], item["Mobile"],
                         item["University"], item["Major"]])
the thing i output now is a row which contains the data in the first dictionary, i just want them seperated, each in its own column within its row.
It is surprising how many questions here try to fill in data with a while loop and the input() command. In all fairness, this is not Python's best use case.
Imagine you had the dictionary just filled in your code:
# Two ready-made records; in real code these would come from elsewhere.
dict1 = {'name': "Kain", 'email': 'make_it_up#something.com'}
dict2 = {'name': "Abel", 'email': 'make_it_up2#otherthing.com'}
dict_list = [dict1, dict2]

import csv

# DictWriter maps each dict's keys onto the declared column order, so a
# header plus one writerows call exports the whole list.
with open('data.csv', 'w') as f:
    w = csv.DictWriter(f, ['name', 'email'], lineterminator='\n')
    w.writeheader()
    w.writerows(dict_list)
Note there are many questions about the csv module on SO
as well as there are examples in documentation.

Write data from one csv to another python

I have three CSV files with attributes Product_ID, Name, Cost, Description. Each file contains Product_ID. I want to combine Name (file1), Cost(file2), Description(File3) to new CSV file with Product_ID and all three above attributes. I need efficient code as files contains over 130000 rows.
After combining all data to new file, I have to load that data in a dictionary.
Like: Product_Id as Key and Name,Cost,Description as Value.
It might be more efficient to read each input .csv into a dictionary before creating your aggregated result.
Here's a solution for reading in each file and storing the columns in a dictionary with Product_IDs as the keys. I assume that each Product_ID value exists in each file and that headers are included. I also assume that there are no duplicate columns across the files aside from Product_ID.
import csv
from collections import defaultdict

def merge_by_product_id(files, out_path='result.csv', key='Product_ID'):
    """Merge several csv files on a shared key column and write the result.

    Each input file must contain a *key* column; every other column is
    appended, in file order, to that key's row.  Assumes each key value
    exists in every file and that non-key column names do not repeat.
    Returns the merged dict (key -> list of other-column values).
    """
    entries = defaultdict(list)
    headers = [key]
    for filename in files:
        # 'rU' universal-newline mode was removed in Python 3.11; and csv
        # files must be opened in text mode with newline='' on Python 3.
        with open(filename, newline='') as f:
            reader = csv.reader(f)
            heads = next(reader)        # header line
            pk = heads.index(key)       # position of the key column
            # Collect this file's non-key column names.
            headers.extend(x for i, x in enumerate(heads) if i != pk)
            for row in reader:
                # Append the non-key values under this row's key.
                entries[row[pk]].extend(x for i, x in enumerate(row) if i != pk)
    # Text mode with newline='', not 'wb': csv.writer expects str on Python 3.
    with open(out_path, 'w', newline='') as out:
        writer = csv.writer(out)
        writer.writerow(headers)
        for k, value in entries.items():
            writer.writerow([k] + value)  # key plus its concatenated values
    return entries

if __name__ == "__main__":
    merge_by_product_id(['names.csv', 'costs.csv', 'descriptions.csv'])
A general solution that produces a record, maybe incomplete, for each id it encounters processing the 3 files needs the use of a specialized data structure that fortunately is just a list, with a preassigned number of slots
def merge_records(fn1, fn2, fn3):
    """Merge three two-column csv files (id,name / id,cost / id,desc).

    Builds one three-slot record per id; a slot left as None marks
    information missing for that id.  Complete records are printed as
    csv lines (the original used Python 2 `print` statements, a
    SyntaxError on 3.x); incomplete ones are skipped — adjust that
    branch to whatever policy you need.  Returns the merged dict.
    """
    # Seed from the names file: [name, None, None] per id.  A `with`
    # block closes the handle the original's bare open() leaked.
    with open(fn1) as f1:
        d = {rec_id: [name, None, None]
             for rec_id, name in (line.strip().split(',') for line in f1)}
    with open(fn2) as f2:
        for line in f2:
            rec_id, cost = line.strip().split(',')
            if rec_id in d:
                d[rec_id][1] = cost
            else:
                d[rec_id] = [None, cost, None]
    with open(fn3) as f3:
        for line in f3:
            rec_id, desc = line.strip().split(',')
            if rec_id in d:
                d[rec_id][2] = desc
            else:
                d[rec_id] = [None, None, desc]
    for rec_id in d:
        if all(d[rec_id]):
            print(','.join([rec_id] + d[rec_id]))
        else:
            # Incomplete record: decide on your own what you want here.
            pass
    return d
If you are sure that you don't want to further process incomplete records, the code above can be simplified
def merge_complete_records(fn1, fn2, fn3):
    """Simplified merge keeping only ids present in the first file.

    Appends cost (file 2) and description (file 3) to each id's list;
    ids whose list reaches all three fields are printed as csv lines.
    Returns the merged dict.
    """
    with open(fn1) as f1:
        d = {rec_id: [name]
             for rec_id, name in (line.strip().split(',') for line in f1)}
    with open(fn2) as f2:
        for line in f2:
            rec_id, cost = line.strip().split(',')
            if rec_id in d:
                # BUG FIX: the original appended `name` — a stale variable
                # left over from the comprehension — instead of the cost
                # just parsed from this line.
                d[rec_id].append(cost)
    with open(fn3) as f3:
        for line in f3:
            rec_id, desc = line.strip().split(',')
            if rec_id in d:
                d[rec_id].append(desc)
    for rec_id in d:
        if len(d[rec_id]) == 3:
            # Python 3 print call (the original used Python 2 syntax).
            print(','.join([rec_id] + d[rec_id]))
    return d

Categories