I have put the together code to compare and match data between two CSVs and collate that data into a new CSV. The CSVs have similar data but the column names and their positions are different.
When I Run and Debug, it throws the following error at line 42:
Exception has occurred: TypeError
'dict_keys' object is not subscriptable
File "D:\Documents\Python\iris\accountmanagement2.py", line 42, in <module>
file1_columns[0]: file2_columns[1],
TypeError: 'dict_keys' object is not subscriptable
This is the code I have put together for this task. I tried using the type(data1[0].keys()) but that just says 'type' object is not subscriptable.
Really need some advice or ideas as I am completely stumped :)
import csv
# specify the file paths for the two CSV files
file1 = "D:\Documents\Python\iris\esr.csv"
file2 = "D:\Documents\Python\iris\iris.csv"
def define_columns(file_path):
with open(file_path, "r") as f:
reader = csv.reader(f)
# get the first row (column headers)
columns = next(reader)
return columns
# Define the keys of the two files
file1_keys = define_columns(file1)
file2_keys = define_columns(file2)
# read the data from the first CSV file
data1 = []
with open(file1, "r") as f:
reader = csv.DictReader(f)
for row in reader:
# Convert data to ASCII
for key in file1_keys:
row[key] = row[key].encode("ascii", "ignore").decode()
data1.append(row)
# read the data from the second CSV file
data2 = []
with open(file2, "r") as f:
reader = csv.DictReader(f)
for row in reader:
# Convert data to ASCII
for key in file2_keys:
row[key] = row[key].encode("ascii", "ignore").decode()
data2.append(row)
# Define the columns name of the two files
file1_columns = data1[0].keys()
file2_columns = data2[0].keys()
col_map = {
file1_columns[0]: file2_columns[1],
file1_columns[2]: file2_columns[5],
file1_columns[1]: file2_columns[6],
file1_columns[4]: file2_columns[7]
}
# match the data from the two files based on a specific column (e.g. "ID")
matching_data = []
non_matching_data1 = []
non_matching_data2 = []
for row1 in data1:
matched = False
for row2 in data2:
if row1[col_map[file1_columns[0]]] == row2[col_map[file2_columns[1]]]:
matching_data.append({**row1, **row2})
matched = True
break
if not matched:
non_matching_data1.append(row1)
for row2 in data2:
matched = False
for row1 in data1:
if row1[col_map[file1_columns[0]]] == row2[col_map[file2_columns[1]]]:
matched = True
break
if not matched:
non_matching_data2.append(row2)
# create a new CSV file with the matched data
with open("matched_data.csv", "w") as f:
writer = csv.DictWriter(f, fieldnames=list(col_map.keys())+list(col_map.values()))
writer.writeheader()
for row in matching_data:
writer.writerow(row)
# create a new CSV file with the non-matching data from file1
with open("non_matching_data1.csv", "w") as f:
writer = csv.DictWriter(f, fieldnames=list(data1[0].keys()))
writer.writeheader()
for row in non_matching_data1:
writer.writerow(row)
# create a new CSV file with the non-matching data from file2
with open("non_matching_data2.csv", "w") as f:
writer = csv.DictWriter(f, fieldnames=list(data2[0].keys()))
writer.writeheader()
for row in non_matching_data2:
writer.writerow(row)
Related
I have 2 CSVs which are New.csv and Old.csv shown below:
Old.csv
longName,shortName,eventType,number,severity
ACTAGENT201,ACAT201,RES,1,INFO
ACTAGENT202,ACAT202,RES,2,ALERT
ACODE801,AC801,ADMIN,1,MINOR
ACODE802,AC802,ADMIN,2,MINOR
ACODE102,AC102,COMM,2,CRITICAL
ACODE103,AC103,COMM,3,CRITICAL
ACODE104,AC104,COMM,4,CRITICAL
ACODE105,AC105,COMM,5,CRITICAL
ACODE106,AC106,COMM,6,CRITICAL
New.csv
longName,shortName,eventType,number,severity
ACTAGENT201,ACAT201,RES,1,INFO
ACTAGENT202,ACAT202,RES,2,ALERT
ACODE801,AC801,ADMIN,1,MINOR
ACODE802,AC802,ThisHasBeenChanged,2,MINOR
ACODE102,AC102,COMM,2,CRITICAL
ACODE103,AC103,COMM,3,CRITICAL
ACODE104,AC104,COMM,4,THISHASBEENCHANGED
ACODE105,AC105,COMM,5,CRITICAL
ACODE106,AC106,COMM,6,CRITICAL
If there is data in one of the columns in the row that has been modified/changed between the old.csv and the new.csv then that whole row should be appended to the changes.csv like this with each column from old.csv and new.csv beside each other:
I know how to find new and deleted items in the csv, but could not figure out how to get the modified items. Code below:
import csv
def DeletedItems(old_csv, new_csv, changes_csv):
with open(new_csv, newline="", encoding="utf8") as new_fp:
csv_reader = csv.reader(new_fp)
csv_headings = next(csv_reader)
new_long_names = {row[0] for row in csv.reader(new_fp)}
with open(old_csv, newline="", encoding="utf8") as old_fp:
with open(changes_csv, "a", newline="", encoding="utf8") as changes_fp:
writer = csv.writer(changes_fp)
writer.writerow("")
for row in csv.reader(old_fp):
if row[0] not in new_long_names:
writer.writerow(row)
def NewItems(old_csv, new_csv, changes_csv):
with open(old_csv, newline="", encoding="utf8") as old_fp:
csv_reader = csv.reader(old_fp)
csv_headings = next(csv_reader)
old_long_names = {row[0] for row in csv.reader(old_fp)}
with open(new_csv, newline="", encoding="utf8") as new_fp:
with open(changes_csv, "w", newline="", encoding="utf8") as changes_fp:
writer = csv.writer(changes_fp)
for row in csv.reader(new_fp):
if row[0] not in old_long_names:
writer.writerow(row)
NewItems("old.csv", "new.csv", "changes.csv")
DeletedItems("old.csv", "new.csv", "changes.csv")
First, read both CSV files into a dictionary, using the longName values as keys.
import csv
with open(old_csv_file, "r") as fh:
reader = csv.reader(fh)
old_csv = {row[0]: row for row in reader}
with open(new_csv_file, "r") as fh:
reader = csv.reader(fh)
new_csv = {row[0]: row for row in reader}
Then, it's easy to find newly added and deleted keys using set operations.
old_longNames = set(old_csv.keys())
new_longNames = set(new_csv.keys())
# common: set intersection
common_longNames = old_longNames.intersection(new_longNames)
# removed: whatever's in old but not in new
removed_longNames = old_longNames - new_longNames
# added: whatever's in new but not in old
added_longNames = new_longNames - old_longNames
Finally, iterate over the common set to find where there are changes:
changed_longNames = []
for key in common_longNames:
old_row = old_csv[key]
new_row = new_csv[key]
# if any(o != n for o, n in zip(old_row, new_row)):
if old_row != new_row:
# this row has at least one column changed. Do whatever
print(f"LongName {key} has changes")
changed_longNames.append(key)
Or, as a list comprehension:
changed_longNames = [key for key in common_longNames if old_csv[key] != new_csv[key]]
Writing everything to a new csv file is also fairly trivial. Note that the sets don't preserve the order, so you might not get the result in the same order.
with open("deleted.csv", "w") as fh:
writer = csv.writer(fh)
for key in removed_longNames:
writer.writerow(old_csv[key])
with open("inserted.csv", "w") as fh:
writer = csv.writer(fh)
for key in added_longNames:
writer.writerow(new_csv[key])
with open("changed.csv", "w") as fh:
writer = csv.writer(fh)
for key in changed_longNames:
old_row = old_csv[key]
new_row = new_csv[key]
merged_row = []
for oi, ni in zip(old_row, new_row):
merged_row.append(oi)
merged_row.append(ni)
writer.writerow(merged_row)
This is one file result.csv:
M11251TH1230
M11543TH4292
M11435TDS144
This is another file sample.csv:
M11435TDS144,STB#1,Router#1
M11543TH4292,STB#2,Router#1
M11509TD9937,STB#3,Router#1
M11543TH4258,STB#4,Router#1
Can I write a Python program to compare both the files and if line in result.csv matches with the first word in the line in sample.csv, then append 1 else append 0 at every line in sample.csv?
import pandas as pd
d1 = pd.read_csv("1.csv",names=["Type"])
d2 = pd.read_csv("2.csv",names=["Type","Col2","Col3"])
d2["Index"] = 0
for x in d1["Type"] :
d2["Index"][d2["Type"] == x] = 1
d2.to_csv("3.csv",header=False)
Considering "1.csv" and "2.csv" are your csv input files and "3.csv" is the result you needed
The solution using csv.reader and csv.writer (csv module):
import csv
newLines = []
# change the file path to the actual one
with open('./data/result.csv', newline='\n') as csvfile:
data = csv.reader(csvfile)
items = [''.join(line) for line in data]
with open('./data/sample.csv', newline='\n') as csvfile:
data = list(csv.reader(csvfile))
for line in data:
line.append(1 if line[0] in items else 0)
newLines.append(line)
with open('./data/sample.csv', 'w', newline='\n') as csvfile:
writer = csv.writer(csvfile)
writer.writerows(newLines)
The sample.csv contents:
M11435TDS144,STB#1,Router#1,1
M11543TH4292,STB#2,Router#1,1
M11509TD9937,STB#3,Router#1,0
M11543TH4258,STB#4,Router#1,0
With only one column, I wonder why you made it as a result.csv. If it is not going to have any more columns, a simple file read operation would suffice. Along with converting the data from result.csv to dictionary will help in quick run as well.
result_file = "result.csv"
sample_file = "sample.csv"
with open(result_file) as fp:
result_data = fp.read()
result_dict = dict.fromkeys(result_data.split("\n"))
"""
You can change the above logic, in case you have very few fields on csv like this:
result_data = fp.readlines()
result_dict = {}
for result in result_data:
key, other_field = result.split(",", 1)
result_dict[key] = other_field.strip()
"""
#Since sample.csv is a real csv, using csv reader and writer
with open(sample_file, "rb") as fp:
sample_data = csv.reader(fp)
output_data = []
for data in sample_data:
output_data.append("%s,%d" % (data, data[0] in result_dict))
with open(sample_file, "wb") as fp:
data_writer = csv.writer(fp)
data_writer.writerows(output_data)
The following snippet of code will work for you
import csv
with open('result.csv', 'rb') as f:
reader = csv.reader(f)
result_list = []
for row in reader:
result_list.extend(row)
with open('sample.csv', 'rb') as f:
reader = csv.reader(f)
sample_list = []
for row in reader:
if row[0] in result_list:
sample_list.append(row + [1])
else:
sample_list.append(row + [0]
with open('sample.csv', 'wb') as f:
writer = csv.writer(f)
writer.writerows(sample_list)
I would like to do the following
read a csv file, Add a new first column, then rename some of the columns
then load the records from csv file.
Ultimately, I would like the first column to be populated with the file
name.
I'm fairly new to Python and I've kind of worked out how to change the fieldnames however, loading the data is a problem as it's looking for the original fieldnames which no longer match.
Code snippet
import csv
import os
inputFileName = "manifest1.csv"
outputFileName = os.path.splitext(inputFileName)[0] + "_modified.csv"
with open(inputFileName, 'rb') as inFile, open(outputFileName, 'wb') as outfile:
r = csv.DictReader(inFile)
fieldnames = ['MapSvcName','ClientHostName', 'Databasetype', 'ID_A', 'KeepExistingData', 'KeepExistingMapCache', 'Name', 'OnPremisePath', 'Resourcestype']
w = csv.DictWriter(outfile,fieldnames)
w.writeheader()
*** Here is where I start to go wrong
# copy the rest
for node, row in enumerate(r,1):
w.writerow(dict(row))
Error
File "D:\Apps\Python27\ArcGIS10.3\lib\csv.py", line 148, in _dict_to_list
+ ", ".join([repr(x) for x in wrong_fields]))
ValueError: dict contains fields not in fieldnames: 'Databases [xsi:type]', 'Resources [xsi:type]', 'ID'
Would like to some assistance to not just learn but truly understand what I need to do.
Cheers and thanks
Peter
Update..
I think I've worked it out
import csv
import os
inputFileName = "manifest1.csv"
outputFileName = os.path.splitext(inputFileName)[0] + "_modified.csv"
with open(inputFileName, 'rb') as inFile, open(outputFileName, 'wb') as outfile:
r = csv.reader(inFile)
w = csv.writer(outfile)
header = next(r)
header.insert(0, 'MapSvcName')
#w.writerow(header)
next(r, None) # skip the first row from the reader, the old header
# write new header
w.writerow(['MapSvcName','ClientHostName', 'Databasetype', 'ID_A', 'KeepExistingData', 'KeepExistingMapCache', 'Name', 'OnPremisePath', 'Resourcestype'])
prevRow = next(r)
prevRow.insert(0, '0')
w.writerow(prevRow)
for row in r:
if prevRow[-1] == row[-1]:
val = '0'
else:
val = prevRow[-1]
row.insert(0,val)
prevRow = row
w.writerow(row)
I'm using this information (downloaded the file to my computer) http://www.genome.gov/admin/gwascatalog.txt
and wrote this
import csv
path = '/Users/mtchavez/Documents/ALS/Diseasome/GWAS.txt'
read_file = open(path)
reader = csv.reader(read_file, delimiter = '\t')
fieldnames = reader.next()
rows = list(reader)
read_file.close()
write_file = open('datatest.csv', 'wb')
writer = csv.writer(write_file, delimiter = '\t')
writer.writerow(('disease', 'genes'))
disease_gene = dict()
for row in rows:
disease = row[7]
reported_genes = row[13]
but I get an error message:
File "data1.py", line 18, in <module>
disease = row[7]
IndexError: list index out of range
There is an empty line at the end of this csv file and it will create an empty row. Delete the last line and the code works fine.
Try filtering for empty lines:
for row in rows:
if not row: continue
disease = row[7]
reported_genes = row[13]
Or more specifically, filter for the desired length:
for row in rows:
if len(row) != EXPECTED_LENGTH_OF_RECORD: continue
disease = row[7]
reported_genes = row[13]
Some example data:
title1|title2|title3|title4|merge
test|data|here|and
test|data|343|AND
",3|data|343|and
My attempt at coding this:
import csv
import StringIO
storedoutput = StringIO.StringIO()
fields = ('title1', 'title2', 'title3', 'title4', 'merge')
with open('file.csv', 'rb') as input_csv:
reader = csv.DictReader(input_csv, fields, delimiter='|')
for counter, row in enumerate(reader):
counter += 1
#print row
if counter != 1:
for field in fields:
if field == "merge":
row['merge'] = ("%s%s%s" % (row["title1"], row["title3"], row["title4"]))
print row
storedoutput.writelines(','.join(map(str, row)) + '\n')
contents = storedoutput.getvalue()
storedoutput.close()
print "".join(contents)
with open('file.csv', 'rb') as input_csv:
input_csv = input_csv.read().strip()
output_csv = []
output_csv.append(contents.strip())
if "".join(output_csv) != input_csv:
with open('file.csv', 'wb') as new_csv:
new_csv.write("".join(output_csv))
Output should be
title1|title2|title3|title4|merge
test|data|here|and|testhereand
test|data|343|AND|test343AND
",3|data|343|and|",3343and
For your reference upon running this code the first print it prints the rows as I would hope then to appear in the output csv. However the second print prints the title row x times where x is the number of rows.
Any input or corrections or working code would be appreciated.
I think we can make this a lot simpler. Dealing with the rogue " was a bit of a nuisance, I admit, because you have to work hard to tell Python you don't want to worry about it.
import csv
with open('file.csv', 'rb') as input_csv, open("new_file.csv", "wb") as output_csv:
reader = csv.DictReader(input_csv, delimiter='|', quoting=csv.QUOTE_NONE)
writer = csv.DictWriter(output_csv, reader.fieldnames, delimiter="|",quoting=csv.QUOTE_NONE, quotechar=None)
merge_cols = "title1", "title3", "title4"
writer.writeheader()
for row in reader:
row["merge"] = ''.join(row[col] for col in merge_cols)
writer.writerow(row)
produces
$ cat new_file.csv
title1|title2|title3|title4|merge
test|data|here|and|testhereand
test|data|343|AND|test343AND
",3|data|343|and|",3343and
Note that even though you wanted the original file updated, I refused. Why? It's a bad idea, because then you can destroy your data while working on it.
How can I be so sure? Because that's exactly what I did when I first ran your code, and I know better. ;^)
That double quote in the last line is definitely messing up the csv.DictReader().
This works:
new_lines = []
with open('file.csv', 'rb') as f:
# skip the first line
new_lines.append(f.next().strip())
for line in f:
# the newline and split the fields
line = line.strip().split('|')
# exctract the field data you want
title1, title3, title4 = line[0], line[2], line[3]
# turn the field data into a string and append in to the rest
line.append(''.join([title1, title3, title4]))
# save the new line for later
new_lines.append('|'.join(line))
with open('file.csv', 'w') as f:
# make one long string and write it to the new file
f.write('\n'.join(new_lines))
import csv
import StringIO
stored_output = StringIO.StringIO()
with open('file.csv', 'rb') as input_csv:
reader = csv.DictReader(input_csv, delimiter='|', quoting=csv.QUOTE_NONE)
writer = csv.DictWriter(stored_output, reader.fieldnames, delimiter="|",quoting=csv.QUOTE_NONE, quotechar=None)
merge_cols = "title1", "title3", "title4"
writer.writeheader()
for row in reader:
row["merge"] = ''.join(row[col] for col in merge_cols)
writer.writerow(row)
contents = stored_output.getvalue()
stored_output.close()
print contents
with open('file.csv', 'rb') as input_csv:
input_csv = input_csv.read().strip()
if input_csv != contents.strip():
with open('file.csv', 'wb') as new_csv:
new_csv.write("".join(contents))