Merging two CSV files by a common column python - python

I am trying to merge two csv files with a common id column and write the merge to a new file. I have tried the following but it is giving me an error -
import csv
from collections import OrderedDict
filenames = "stops.csv", "stops2.csv"
data = OrderedDict()
fieldnames = []
for filename in filenames:
with open(filename, "rb") as fp: # python 2
reader = csv.DictReader(fp)
fieldnames.extend(reader.fieldnames)
for row in reader:
data.setdefault(row["stop_id"], {}).update(row)
fieldnames = list(OrderedDict.fromkeys(fieldnames))
with open("merged.csv", "wb") as fp:
writer = csv.writer(fp)
writer.writerow(fieldnames)
for row in data.itervalues():
writer.writerow([row.get(field, '') for field in fieldnames])
Both files have the "stop_id" column but I'm getting this error back -
KeyError: 'stop_id'
Any help would be much appreciated.
Thanks

Here is an example using pandas
import sys
from StringIO import StringIO
import pandas as pd
TESTDATA=StringIO("""DOB;First;Last
2016-07-26;John;smith
2016-07-27;Mathew;George
2016-07-28;Aryan;Singh
2016-07-29;Ella;Gayau
""")
list1 = pd.read_csv(TESTDATA, sep=";")
TESTDATA=StringIO("""Date of Birth;Patient First Name;Patient Last Name
2016-07-26;John;smith
2016-07-27;Mathew;XXX
2016-07-28;Aryan;Singh
2016-07-20;Ella;Gayau
""")
list2 = pd.read_csv(TESTDATA, sep=";")
print list2
print list1
common = pd.merge(list1, list2, how='left', left_on=['Last', 'First', 'DOB'], right_on=['Patient Last Name', 'Patient First Name', 'Date of Birth']).dropna()
print common

Thanks Shijo.
This is what worked for me after - merged by the first column in each csv.
import csv
from collections import OrderedDict
with open('stops.csv', 'rb') as f:
r = csv.reader(f)
dict2 = {row[0]: row[1:] for row in r}
with open('stops2.csv', 'rb') as f:
r = csv.reader(f)
dict1 = OrderedDict((row[0], row[1:]) for row in r)
result = OrderedDict()
for d in (dict1, dict2):
for key, value in d.iteritems():
result.setdefault(key, []).extend(value)
with open('ab_combined.csv', 'wb') as f:
w = csv.writer(f)
for key, value in result.iteritems():
w.writerow([key] + value)

Related

Split csv file into 2 list depending upon column name using python

I want to split csv file into 2 lists using column name
CSV file:
Molecule Name,SMILES
ZINC53 (Aspirin),CC(=O)Oc1ccccc1C(=O)O
ZINC7460 (Vatalanib),Clc1ccc(Nc2nnc(Cc3ccncc3)c3ccccc23)cc1
ZINC1493878 (Sorafenib),CNC(=O)c1cc(Oc2ccc(NC(=O)Nc3ccc(Cl)c(C(F)(F)F)c3)cc2)ccn1
Code:
namelist = list()
smileslist = list()
with open('./file.csv', 'r') as f:
f = csv.reader(f, delimiter=',')
columns = next(f)
type_col1 = columns.index("Molecule Name")
type_col2 = columns.index("SMILES")
for column in f:
if type_col1 == 'Molecule Name':
namelist.append(column)
elif type_col2 == 'SMILES':
smileslist.append(column)
With pandas library you can do it as easily as :
import pandas as pd
df = pd.read_csv("./file.csv")
namelist = df["Molecule Name"].tolist()
smileslist = df["SMILES"].tolist()
print(namelist)
print(smileslist)
Or if you prefer using the csv reader you can do it as follow :
import csv
namelist = list()
smileslist = list()
with open("./file.csv", "r") as f:
f = csv.reader(f, delimiter=',')
columns = next(f)
index_col1 = columns.index("Molecule Name")
index_col2 = columns.index("SMILES")
for column in f:
namelist.append(column[index_col1])
smileslist.append(column[index_col2])

write data from dictionary to csv by python

I have a dictionary as below which has repeated item name, the difference is the value of each part name. i want to write those info to csv with expected result is :
import csv
dict={
'test':['part_name','test1','test2','test3','part_name','test1','test2','test3'],
'value':['partA','12','55','109','partB','14','54','106'],
'lcl':['lcl','10','50','100','lcl','10','50','100'],
'ucl':['ucl','18','60','115','ucl','18','60','115'],
}
tmp={}
for k,v1,v2,v3 in zip(dict["test"],dict["value"],dict["lcl"],dict["ucl"]):
tmp.setdefault(k, []).append([v1,v2,v3])
print(tmp)
with open('table.csv','w') as f:
writer_inline = csv.writer(f, delimiter=',', lineterminator=',')
writer = csv.writer(f, delimiter=',', lineterminator='\n')
writer.writerow(tmp.keys())
writer.writerows(zip(*tmp.values()))
Try the below code to get your desired csv. I would recommend not to use dict as name for your dictionary. I have changed it to d:
import csv
d = {
'test':['part_name','test1','test2','test3','part_name','test1','test2','test3'],
'value':['partA','12','55','109','partB','14','54','106'],
'lcl':['lcl','10','50','100','lcl','10','50','100'],
'ucl':['ucl','18','60','115','ucl','18','60','115'],
}
headers = d['test'][:len(set(d['test']))]
size = len(headers)
d.pop('test', None)
parts = []
for i in d:
parts += [[d[i][j:(j+size)] for j in range(0, len(d['value']), size)]]
rows = []
for part in list(zip(*parts)):
rows += part
with open('table.csv','w') as f:
writer = csv.writer(f, delimiter=',', lineterminator='\n')
writer.writerow(headers)
writer.writerows(rows)

Compare 2 different csv files and output all the changes into a new csv

I have 2 CSVs which are New.csv and Old.csv shown below:
Old.csv
longName,shortName,eventType,number,severity
ACTAGENT201,ACAT201,RES,1,INFO
ACTAGENT202,ACAT202,RES,2,ALERT
ACODE801,AC801,ADMIN,1,MINOR
ACODE802,AC802,ADMIN,2,MINOR
ACODE102,AC102,COMM,2,CRITICAL
ACODE103,AC103,COMM,3,CRITICAL
ACODE104,AC104,COMM,4,CRITICAL
ACODE105,AC105,COMM,5,CRITICAL
ACODE106,AC106,COMM,6,CRITICAL
New.csv
longName,shortName,eventType,number,severity
ACTAGENT201,ACAT201,RES,1,INFO
ACTAGENT202,ACAT202,RES,2,ALERT
ACODE801,AC801,ADMIN,1,MINOR
ACODE802,AC802,ThisHasBeenChanged,2,MINOR
ACODE102,AC102,COMM,2,CRITICAL
ACODE103,AC103,COMM,3,CRITICAL
ACODE104,AC104,COMM,4,THISHASBEENCHANGED
ACODE105,AC105,COMM,5,CRITICAL
ACODE106,AC106,COMM,6,CRITICAL
If there is data in one of the columns in the row that has been modified/changed between the old.csv and the new.csv then that whole row should be appended to the changes.csv like this with each column from old.csv and new.csv beside each other:
I know how to find new and deleted items in the csv, but could not figure out how to get the modified items. Code below:
import csv
def DeletedItems(old_csv, new_csv, changes_csv):
with open(new_csv, newline="", encoding="utf8") as new_fp:
csv_reader = csv.reader(new_fp)
csv_headings = next(csv_reader)
new_long_names = {row[0] for row in csv.reader(new_fp)}
with open(old_csv, newline="", encoding="utf8") as old_fp:
with open(changes_csv, "a", newline="", encoding="utf8") as changes_fp:
writer = csv.writer(changes_fp)
writer.writerow("")
for row in csv.reader(old_fp):
if row[0] not in new_long_names:
writer.writerow(row)
def NewItems(old_csv, new_csv, changes_csv):
with open(old_csv, newline="", encoding="utf8") as old_fp:
csv_reader = csv.reader(old_fp)
csv_headings = next(csv_reader)
old_long_names = {row[0] for row in csv.reader(old_fp)}
with open(new_csv, newline="", encoding="utf8") as new_fp:
with open(changes_csv, "w", newline="", encoding="utf8") as changes_fp:
writer = csv.writer(changes_fp)
for row in csv.reader(new_fp):
if row[0] not in old_long_names:
writer.writerow(row)
NewItems("old.csv", "new.csv", "changes.csv")
DeletedItems("old.csv", "new.csv", "changes.csv")
First, read both CSV files into a dictionary, using the longName values as keys.
import csv
with open(old_csv_file, "r") as fh:
reader = csv.reader(fh)
old_csv = {row[0]: row for row in reader}
with open(new_csv_file, "r") as fh:
reader = csv.reader(fh)
new_csv = {row[0]: row for row in reader}
Then, it's easy to find newly added and deleted keys using set operations.
old_longNames = set(old_csv.keys())
new_longNames = set(new_csv.keys())
# common: set intersection
common_longNames = old_longNames.intersection(new_longNames)
# removed: whatever's in old but not in new
removed_longNames = old_longNames - new_longNames
# added: whatever's in new but not in old
added_longNames = new_longNames - old_longNames
Finally, iterate over the common set to find where there are changes:
changed_longNames = []
for key in common_longNames:
old_row = old_csv[key]
new_row = new_csv[key]
# if any(o != n for o, n in zip(old_row, new_row)):
if old_row != new_row:
# this row has at least one column changed. Do whatever
print(f"LongName {key} has changes")
changed_longNames.append(key)
Or, as a list comprehension:
changed_longNames = [key for key in common_longNames if old_csv[key] != new_csv[key]]
Writing everything to a new csv file is also fairly trivial. Note that the sets don't preserve the order, so you might not get the result in the same order.
with open("deleted.csv", "w") as fh:
writer = csv.writer(fh)
for key in removed_longNames:
writer.writerow(old_csv[key])
with open("inserted.csv", "w") as fh:
writer = csv.writer(fh)
for key in added_longNames:
writer.writerow(new_csv[key])
with open("changed.csv", "w") as fh:
writer = csv.writer(fh)
for key in changed_longNames:
old_row = old_csv[key]
new_row = new_csv[key]
merged_row = []
for oi, ni in zip(old_row, new_row):
merged_row.append(oi)
merged_row.append(ni)
writer.writerow(merged_row)

Python file matching and appending

This is one file result.csv:
M11251TH1230
M11543TH4292
M11435TDS144
This is another file sample.csv:
M11435TDS144,STB#1,Router#1
M11543TH4292,STB#2,Router#1
M11509TD9937,STB#3,Router#1
M11543TH4258,STB#4,Router#1
Can I write a Python program to compare both the files and if line in result.csv matches with the first word in the line in sample.csv, then append 1 else append 0 at every line in sample.csv?
import pandas as pd
d1 = pd.read_csv("1.csv",names=["Type"])
d2 = pd.read_csv("2.csv",names=["Type","Col2","Col3"])
d2["Index"] = 0
for x in d1["Type"] :
d2["Index"][d2["Type"] == x] = 1
d2.to_csv("3.csv",header=False)
Considering "1.csv" and "2.csv" are your csv input files and "3.csv" is the result you needed
The solution using csv.reader and csv.writer (csv module):
import csv
newLines = []
# change the file path to the actual one
with open('./data/result.csv', newline='\n') as csvfile:
data = csv.reader(csvfile)
items = [''.join(line) for line in data]
with open('./data/sample.csv', newline='\n') as csvfile:
data = list(csv.reader(csvfile))
for line in data:
line.append(1 if line[0] in items else 0)
newLines.append(line)
with open('./data/sample.csv', 'w', newline='\n') as csvfile:
writer = csv.writer(csvfile)
writer.writerows(newLines)
The sample.csv contents:
M11435TDS144,STB#1,Router#1,1
M11543TH4292,STB#2,Router#1,1
M11509TD9937,STB#3,Router#1,0
M11543TH4258,STB#4,Router#1,0
With only one column, I wonder why you made it as a result.csv. If it is not going to have any more columns, a simple file read operation would suffice. Along with converting the data from result.csv to dictionary will help in quick run as well.
result_file = "result.csv"
sample_file = "sample.csv"
with open(result_file) as fp:
result_data = fp.read()
result_dict = dict.fromkeys(result_data.split("\n"))
"""
You can change the above logic, in case you have very few fields on csv like this:
result_data = fp.readlines()
result_dict = {}
for result in result_data:
key, other_field = result.split(",", 1)
result_dict[key] = other_field.strip()
"""
#Since sample.csv is a real csv, using csv reader and writer
with open(sample_file, "rb") as fp:
sample_data = csv.reader(fp)
output_data = []
for data in sample_data:
output_data.append("%s,%d" % (data, data[0] in result_dict))
with open(sample_file, "wb") as fp:
data_writer = csv.writer(fp)
data_writer.writerows(output_data)
The following snippet of code will work for you
import csv
with open('result.csv', 'rb') as f:
reader = csv.reader(f)
result_list = []
for row in reader:
result_list.extend(row)
with open('sample.csv', 'rb') as f:
reader = csv.reader(f)
sample_list = []
for row in reader:
if row[0] in result_list:
sample_list.append(row + [1])
else:
sample_list.append(row + [0]
with open('sample.csv', 'wb') as f:
writer = csv.writer(f)
writer.writerows(sample_list)

Appending dictionary results to csv in python

I am looking to append a dictionary I have to a CSV file were I already have a header line
and if a value doesn't exist I want to write '-999':
SDict ={T1:'A',T2:'B',T4:'D')
where CSV file has header of
T1,T2,T3,T4,T5
7,8,9,10,11
and the expected results are
T1,T2,T3,T4,T5
7,8,9,10,11
A,B,-999,D,-999
I am trying to do so with the code:
import sys
import os
import csv
def GetFileHeader(Fpath):
i=10
ResFile=open (Fpath, 'r+')
HeaderDict={}
r=csv.reader(ResFile)
HeaderList = r.next()
for Header in HeaderList:
HeaderDict[Header]=i+1
print HeaderDict
ResFile.close()
return HeaderDict
Fpath= r'Z:\temp\assaf\S2TTP\S2T_TP\modules\results\Y124\res.csv'
Header= GetFileHeader(Fpath)
with open(Fpath,'rb') as fin:
dr = csv.DictReader(fin, dialect='excel')
print dr
print dr.fieldnames
# dr.fieldnames contains values from first row of `f`.
with open(Fpath,'ab+') as fou:
dw = csv.DictWriter(fou, dialect='excel', fieldnames=dr.fieldnames)
fieldnames=dr.fieldnames
for K in fieldnames:
dw.writerow(Header[k])
I think you can simply do:
import csv
SDict = {'T1': 'A', 'T2': 'B', 'T4': 'D'}
with open('file.csv', 'r+b') as f:
header = next(csv.reader(f))
dict_writer = csv.DictWriter(f, header, -999)
dict_writer.writerow(SDict)
This is assuming you're on Python 2.X. Also, be wary of files which don't end in a newline, or you could end up with a row like 7,8,9,10,11A,B,-999,D,-999.

Categories