Related
I have put the together code to compare and match data between two CSVs and collate that data into a new CSV. The CSVs have similar data but the column names and their positions are different.
When I Run and Debug, it throws the following error at line 42:
Exception has occurred: TypeError
'dict_keys' object is not subscriptable
File "D:\Documents\Python\iris\accountmanagement2.py", line 42, in <module>
file1_columns[0]: file2_columns[1],
TypeError: 'dict_keys' object is not subscriptable
This is the code I have put together for this task. I tried using the type(data1[0].keys()) but that just says 'type' object is not subscriptable.
Really need some advice or ideas as I am completely stumped :)
import csv
# specify the file paths for the two CSV files
file1 = "D:\Documents\Python\iris\esr.csv"
file2 = "D:\Documents\Python\iris\iris.csv"
def define_columns(file_path):
with open(file_path, "r") as f:
reader = csv.reader(f)
# get the first row (column headers)
columns = next(reader)
return columns
# Define the keys of the two files
file1_keys = define_columns(file1)
file2_keys = define_columns(file2)
# read the data from the first CSV file
data1 = []
with open(file1, "r") as f:
reader = csv.DictReader(f)
for row in reader:
# Convert data to ASCII
for key in file1_keys:
row[key] = row[key].encode("ascii", "ignore").decode()
data1.append(row)
# read the data from the second CSV file
data2 = []
with open(file2, "r") as f:
reader = csv.DictReader(f)
for row in reader:
# Convert data to ASCII
for key in file2_keys:
row[key] = row[key].encode("ascii", "ignore").decode()
data2.append(row)
# Define the columns name of the two files
file1_columns = data1[0].keys()
file2_columns = data2[0].keys()
col_map = {
file1_columns[0]: file2_columns[1],
file1_columns[2]: file2_columns[5],
file1_columns[1]: file2_columns[6],
file1_columns[4]: file2_columns[7]
}
# match the data from the two files based on a specific column (e.g. "ID")
matching_data = []
non_matching_data1 = []
non_matching_data2 = []
for row1 in data1:
matched = False
for row2 in data2:
if row1[col_map[file1_columns[0]]] == row2[col_map[file2_columns[1]]]:
matching_data.append({**row1, **row2})
matched = True
break
if not matched:
non_matching_data1.append(row1)
for row2 in data2:
matched = False
for row1 in data1:
if row1[col_map[file1_columns[0]]] == row2[col_map[file2_columns[1]]]:
matched = True
break
if not matched:
non_matching_data2.append(row2)
# create a new CSV file with the matched data
with open("matched_data.csv", "w") as f:
writer = csv.DictWriter(f, fieldnames=list(col_map.keys())+list(col_map.values()))
writer.writeheader()
for row in matching_data:
writer.writerow(row)
# create a new CSV file with the non-matching data from file1
with open("non_matching_data1.csv", "w") as f:
writer = csv.DictWriter(f, fieldnames=list(data1[0].keys()))
writer.writeheader()
for row in non_matching_data1:
writer.writerow(row)
# create a new CSV file with the non-matching data from file2
with open("non_matching_data2.csv", "w") as f:
writer = csv.DictWriter(f, fieldnames=list(data2[0].keys()))
writer.writeheader()
for row in non_matching_data2:
writer.writerow(row)
I have 2 CSVs which are New.csv and Old.csv shown below:
Old.csv
longName,shortName,eventType,number,severity
ACTAGENT201,ACAT201,RES,1,INFO
ACTAGENT202,ACAT202,RES,2,ALERT
ACODE801,AC801,ADMIN,1,MINOR
ACODE802,AC802,ADMIN,2,MINOR
ACODE102,AC102,COMM,2,CRITICAL
ACODE103,AC103,COMM,3,CRITICAL
ACODE104,AC104,COMM,4,CRITICAL
ACODE105,AC105,COMM,5,CRITICAL
ACODE106,AC106,COMM,6,CRITICAL
New.csv
longName,shortName,eventType,number,severity
ACTAGENT201,ACAT201,RES,1,INFO
ACTAGENT202,ACAT202,RES,2,ALERT
ACODE801,AC801,ADMIN,1,MINOR
ACODE802,AC802,ThisHasBeenChanged,2,MINOR
ACODE102,AC102,COMM,2,CRITICAL
ACODE103,AC103,COMM,3,CRITICAL
ACODE104,AC104,COMM,4,THISHASBEENCHANGED
ACODE105,AC105,COMM,5,CRITICAL
ACODE106,AC106,COMM,6,CRITICAL
If there is data in one of the columns in the row that has been modified/changed between the old.csv and the new.csv then that whole row should be appended to the changes.csv like this with each column from old.csv and new.csv beside each other:
I know how to find new and deleted items in the csv, but could not figure out how to get the modified items. Code below:
import csv
def DeletedItems(old_csv, new_csv, changes_csv):
with open(new_csv, newline="", encoding="utf8") as new_fp:
csv_reader = csv.reader(new_fp)
csv_headings = next(csv_reader)
new_long_names = {row[0] for row in csv.reader(new_fp)}
with open(old_csv, newline="", encoding="utf8") as old_fp:
with open(changes_csv, "a", newline="", encoding="utf8") as changes_fp:
writer = csv.writer(changes_fp)
writer.writerow("")
for row in csv.reader(old_fp):
if row[0] not in new_long_names:
writer.writerow(row)
def NewItems(old_csv, new_csv, changes_csv):
with open(old_csv, newline="", encoding="utf8") as old_fp:
csv_reader = csv.reader(old_fp)
csv_headings = next(csv_reader)
old_long_names = {row[0] for row in csv.reader(old_fp)}
with open(new_csv, newline="", encoding="utf8") as new_fp:
with open(changes_csv, "w", newline="", encoding="utf8") as changes_fp:
writer = csv.writer(changes_fp)
for row in csv.reader(new_fp):
if row[0] not in old_long_names:
writer.writerow(row)
NewItems("old.csv", "new.csv", "changes.csv")
DeletedItems("old.csv", "new.csv", "changes.csv")
First, read both CSV files into a dictionary, using the longName values as keys.
import csv
with open(old_csv_file, "r") as fh:
reader = csv.reader(fh)
old_csv = {row[0]: row for row in reader}
with open(new_csv_file, "r") as fh:
reader = csv.reader(fh)
new_csv = {row[0]: row for row in reader}
Then, it's easy to find newly added and deleted keys using set operations.
old_longNames = set(old_csv.keys())
new_longNames = set(new_csv.keys())
# common: set intersection
common_longNames = old_longNames.intersection(new_longNames)
# removed: whatever's in old but not in new
removed_longNames = old_longNames - new_longNames
# added: whatever's in new but not in old
added_longNames = new_longNames - old_longNames
Finally, iterate over the common set to find where there are changes:
changed_longNames = []
for key in common_longNames:
old_row = old_csv[key]
new_row = new_csv[key]
# if any(o != n for o, n in zip(old_row, new_row)):
if old_row != new_row:
# this row has at least one column changed. Do whatever
print(f"LongName {key} has changes")
changed_longNames.append(key)
Or, as a list comprehension:
changed_longNames = [key for key in common_longNames if old_csv[key] != new_csv[key]]
Writing everything to a new csv file is also fairly trivial. Note that the sets don't preserve the order, so you might not get the result in the same order.
with open("deleted.csv", "w") as fh:
writer = csv.writer(fh)
for key in removed_longNames:
writer.writerow(old_csv[key])
with open("inserted.csv", "w") as fh:
writer = csv.writer(fh)
for key in added_longNames:
writer.writerow(new_csv[key])
with open("changed.csv", "w") as fh:
writer = csv.writer(fh)
for key in changed_longNames:
old_row = old_csv[key]
new_row = new_csv[key]
merged_row = []
for oi, ni in zip(old_row, new_row):
merged_row.append(oi)
merged_row.append(ni)
writer.writerow(merged_row)
I am Trying to insert one column on an existing CSV file name as test.csv, on column number E.
If the column E is already occupied, i Need to shift that column to right and insert new.
Column Head should be Day, columns should be filled with current date
Cunnernt Data
Name Age location school
Adam 12 abc xyz
eve 14 abc xyz
Joy 12 abc xyz
Need Out
Name Age location school Day
Adam 12 abc xyz =today()
eve 14 abc xyz =today()
Joy 12 abc xyz =today()
I will have normally 2000 rows
I tried the following code that didn't work for me
import csv
Path = 'C:\\Users\\saquib.khan\\Desktop\\Profile_All\\Demo\\New'
infilename = Path + '\\Test.csv'
outfilename = Path + '\\Out1.csv'
with open(infilename, 'rb') as fp_in, open(outfilename, 'wb') as fp_out:
reader = csv.reader(fp_in, delimiter=",")
writer = csv.writer(fp_out, delimiter=",")
headers = next(reader) # read title row
headers[E:E] = ['New Label']
writer.writerow(headers)
for row in reader:
row[E:E] = [0]
writer.writerow(row)
You are opening the 2 files in b mode (bytes instead of text mode).
Is there a reason why you are doing it this way?
Have you tried to open them in text mode?
with open(infilename, 'r') as fp_in, open(outfilename, 'w') as fp_out:
Does it work in this case?
You can try this:
import csv
Path = 'C:\\Users\\saquib.khan\\Desktop\\Profile_All\\Demo\\New'
infilename = Path + '\\Test.csv'
outfilename = Path + '\\Out1.csv'
with open(infilename, 'rb') as fp_in, open(outfilename, 'wb') as fp_out:
reader = csv.reader(fp_in, delimiter=",")
writer = csv.writer(fp_out, delimiter=",")
headers = next(reader) # read title row
idx = ord('E') - ord('A')
headers[idx:idx] = ['New Label']
writer.writerow(headers)
for row in reader:
row[idx:idx] = [0]
writer.writerow(row)
Convert the column name to the column index to operate with the Python list.
Add Date
now = datetime.datetime.now()
now -= datetime.timedelta(days=1)
dt = now.strftime("%m/%d/20%y")
F = 'in.csv'
df = pd.read_csv(Path1 + F)
df["Day"]=df.shape[0]*[dt]
df.to_csv(Path +"out.csv")
I have the following input file with a header row:
test_in.csv
LON,LAT,NUMBER,STREET,UNIT,CITY,DISTRICT,REGION,POSTCODE,ID,HASH
-72.5708234,41.4155142,39,HICKORY LA,,,,,,,8a0df668e0d49b02
-72.5647745,41.4160301,1213,KILLINGWORTH RD,,,,,,,b3ecaab86e476f46
I need to replace any of the columns with a specified string
for example CITY column's data should be replaced from "" to "MyCity"
My code only outputs the header and first row
python test_forcefld.py test_in.csv MyCity CITY out_test.csv
import csv
import sys
in_file_name = sys.argv[1]
force_data = sys.argv[2]
force_fld = sys.argv[3]
out_file_name = sys.argv[4]
# First read top row/header from input file
fieldnames = []
for filename in [in_file_name]:
with open(filename, "rb") as f_in:
reader = csv.reader(f_in)
headers = next(reader)
for h in headers:
fieldnames.append(h)
#print headers to output file
with open(out_file_name, 'w') as fou:
dw = csv.DictWriter(fou, delimiter=',', fieldnames=fieldnames)
dw.writeheader()
f_in2 = open(in_file_name, "rb")
reader2 = csv.DictReader(f_in2) # Uses the field names in this file
datarow = next(reader2)
datarow[force_fld] = force_data
with open(out_file_name, 'wa') as fou:
dw2 = csv.DictWriter(fou, delimiter=',', fieldnames=fieldnames)
dw2.writeheader()
dw2.writerow(data row)
Output shows
LON,LAT,NUMBER,STREET,UNIT,CITY,DISTRICT,REGION,POSTCODE,ID,HASH
-72.5708234,41.4155142,39,HICKORY LA,,MyCity,,,,,8a0df668e0d49b02
Your code is a little difficult to read, but assuming datarow is a dictionary containing your records:
In your last row, change
dw2.writerow(datarow)
Into
dw2.writerows(datarow)
While you're at it, you should also consider using datarow.keys() for your fieldnames, for conciseness.
This should do it, you just need pandas:
import pandas as pd
df = pd.read_csv(in_file_name, sep=',')
df['CITY'].fillna('MyCity', inplace=True)
And to save it:
df.to_csv(out_file_name)
You can try somthing like this in order to have your desired file:
I'm assuming your input file is called f_input.txt and your output file is called f_output.txt:
data = list(k.rstrip().split(',') for k in open("f_input.txt", 'r'))
with open("f_output.txt", 'a+') as f:
f.write(",".join(data[0]) + '\n')
for k in data[1:]:
# Modify the positions of k[:n] + your data + k[n+1]
# if you need to handle another position
f.write(",".join(k[:6]) + "MyCity" + ",".join(k[7:]) + "\n")
This worked in the end:
import csv
import sys
in_file_name = sys.argv[1]
force_data = sys.argv[2]
force_fld = sys.argv[3]
out_file_name = sys.argv[4]
# First read top row/header from input file
fieldnames = []
for filename in [in_file_name]:
with open(filename, "rb") as f_in:
reader = csv.reader(f_in)
headers = next(reader)
for h in headers:
fieldnames.append(h)
f_in2 = open(in_file_name, "r")
#print headers to output file
fou = open(out_file_name, 'wa')
dw = csv.DictWriter(fou, delimiter=',', fieldnames=fieldnames)
dw.writeheader()
reader2 = csv.DictReader(f_in2) # Uses the field names in this file
for row in reader2:
row[force_fld] = force_data
dw2 = csv.DictWriter(fou, delimiter=',', fieldnames=fieldnames)
dw2.writerow(row)
I need to get information from a list and add a column year from name. I still not sure how to add one field 'year' in record. Can I use append?
And about output file, I just need use outputcsv.writerow(records) isn't it?
This is a part of code that I stuck:
filenames = ('babyQld2010.csv',
'babyQld2011.csv',
'babyQld2012.csv',
'babyQld2012.csv',
'babyQld2014.csv')
outFile = open('babyQldAll.csv','w')
csvFile_out = csv.writer(outFile, delimiter=',')
for filename in filenames:
name, ext = filename.split('.')
year = name[-4:] #extract year from file names
records = extract_names(filename)
# Get (name, count, gender) from list "records",
# and add value of "year" and write into output file (using "for" loop )
Output file look like:
2010,Lola,69,Girl
And input, I have 5 file babyQld2010.csv, babyQld2011.csv, babyQld2012.csv, babyQld2012.csv, babyQld2014.csv which contains:
Mia,425,William,493
and I have to sort it in format and I already done it and save in list 'records'
Lola,69,Girl
now I need to add one field 'year' on 'record' list and export csv file.
This is my full code:
import csv
def extract_names(filename):
''' Extract babyname, count, gender from a csv file,
and return the data in a list.
'''
inFile = open(filename, 'rU')
csvFile = csv.reader(inFile, delimiter=',')
# Initialization
records = []
rowNum = 0
for row in csvFile:
if rowNum != 0:
# +++++ You code here ++++
# Read each row of csv file and save information in list 'records'
# as (name, count, gender)
records.append([row[0], row[1], "Female"])
records.append([row[2], row[3], "Male"])
print('Process each row...')
rowNum += 1
inFile.close()
return(records)
#### Start main program #####
filenames = ('babyQld2010.csv',
'babyQld2011.csv',
'babyQld2012.csv',
'babyQld2012.csv',
'babyQld2014.csv')
with open('babyQldAll.csv','w') as outFile:
csvFile_out = csv.writer(outFile, delimiter=',')
for filename in filenames:
name, ext = filename.split('.')
year = name.split('.')[0][-4:] #extract year from file names
records = extract_names(filename)
for record in records:
csvFile_out.write([year] + record)
print("Write in csv file...")
outFile.close()
To get the year from the csv file you can simply split the string at '.' and then take the last four characters from the first part of the split. Example -
>>> s = 'babyQld2010.csv'
>>> s.split('.')[0][-4:]
'2010'
Then just simply iterate over your list of records, which you say is correct, for each list within in, use list contatenation to create a new list with year at the start and write that to csv file.
I would also suggest that you use with statement for opening the file to write to (and even in the function where you are reading from the other csv files). Example -
filenames = ('babyQld2010.csv',
'babyQld2011.csv',
'babyQld2012.csv',
'babyQld2012.csv',
'babyQld2014.csv')
with open('babyQldAll.csv','w') as outFile:
csvFile_out = csv.writer(outFile, delimiter=',')
for filename in filenames:
name, ext = filename.split('.')
year = name.split('.')[0][-4:] #extract year from file names
records = extract_names(filename)
for record in records:
csvFile_out.writerow([year] + record)
Yes, you can just append the year column to each row as you read it in from your source files. You can read in & write out each row as a dictionary so that you can use your existing column headers to address the data if you need to massage it on the way through.
Using the csv.DictWriter() method you specify your headers (fieldnames) when you set it up. You can then write them out with the writeheader() method.
import csv
file_list = ['babyQld2010.csv',
'babyQld2011.csv',
'babyQld2012.csv',
'babyQld2012.csv',
'babyQld2014.csv']
outFile = open('babyQldAll.csv', 'wb')
csv_writer = csv.DictWriter(outFile,
fieldnames=['name','count','gender','year'])
csv_write_out.writeheader()
for a_file in file_list:
name,ext = a_file.split('.')
year = name[-4:]
with open(a_file, 'rb') as inFile:
csv_read_in = csv.DictReader(inFile)
for row in csv_read_in:
row['year'] = year
csv_writer.writerow(row)
outfile.close()
Hope this helps.