Related
I have 2 CSVs which are New.csv and Old.csv shown below:
Old.csv
longName,shortName,eventType,number,severity
ACTAGENT201,ACAT201,RES,1,INFO
ACTAGENT202,ACAT202,RES,2,ALERT
ACODE801,AC801,ADMIN,1,MINOR
ACODE802,AC802,ADMIN,2,MINOR
ACODE102,AC102,COMM,2,CRITICAL
ACODE103,AC103,COMM,3,CRITICAL
ACODE104,AC104,COMM,4,CRITICAL
ACODE105,AC105,COMM,5,CRITICAL
ACODE106,AC106,COMM,6,CRITICAL
New.csv
longName,shortName,eventType,number,severity
ACTAGENT201,ACAT201,RES,1,INFO
ACTAGENT202,ACAT202,RES,2,ALERT
ACODE801,AC801,ADMIN,1,MINOR
ACODE802,AC802,ThisHasBeenChanged,2,MINOR
ACODE102,AC102,COMM,2,CRITICAL
ACODE103,AC103,COMM,3,CRITICAL
ACODE104,AC104,COMM,4,THISHASBEENCHANGED
ACODE105,AC105,COMM,5,CRITICAL
ACODE106,AC106,COMM,6,CRITICAL
If there is data in one of the columns in the row that has been modified/changed between the old.csv and the new.csv then that whole row should be appended to the changes.csv like this with each column from old.csv and new.csv beside each other:
I know how to find new and deleted items in the csv, but could not figure out how to get the modified items. Code below:
import csv
def DeletedItems(old_csv, new_csv, changes_csv):
with open(new_csv, newline="", encoding="utf8") as new_fp:
csv_reader = csv.reader(new_fp)
csv_headings = next(csv_reader)
new_long_names = {row[0] for row in csv.reader(new_fp)}
with open(old_csv, newline="", encoding="utf8") as old_fp:
with open(changes_csv, "a", newline="", encoding="utf8") as changes_fp:
writer = csv.writer(changes_fp)
writer.writerow("")
for row in csv.reader(old_fp):
if row[0] not in new_long_names:
writer.writerow(row)
def NewItems(old_csv, new_csv, changes_csv):
with open(old_csv, newline="", encoding="utf8") as old_fp:
csv_reader = csv.reader(old_fp)
csv_headings = next(csv_reader)
old_long_names = {row[0] for row in csv.reader(old_fp)}
with open(new_csv, newline="", encoding="utf8") as new_fp:
with open(changes_csv, "w", newline="", encoding="utf8") as changes_fp:
writer = csv.writer(changes_fp)
for row in csv.reader(new_fp):
if row[0] not in old_long_names:
writer.writerow(row)
NewItems("old.csv", "new.csv", "changes.csv")
DeletedItems("old.csv", "new.csv", "changes.csv")
First, read both CSV files into a dictionary, using the longName values as keys.
import csv
with open(old_csv_file, "r") as fh:
reader = csv.reader(fh)
old_csv = {row[0]: row for row in reader}
with open(new_csv_file, "r") as fh:
reader = csv.reader(fh)
new_csv = {row[0]: row for row in reader}
Then, it's easy to find newly added and deleted keys using set operations.
old_longNames = set(old_csv.keys())
new_longNames = set(new_csv.keys())
# common: set intersection
common_longNames = old_longNames.intersection(new_longNames)
# removed: whatever's in old but not in new
removed_longNames = old_longNames - new_longNames
# added: whatever's in new but not in old
added_longNames = new_longNames - old_longNames
Finally, iterate over the common set to find where there are changes:
changed_longNames = []
for key in common_longNames:
old_row = old_csv[key]
new_row = new_csv[key]
# if any(o != n for o, n in zip(old_row, new_row)):
if old_row != new_row:
# this row has at least one column changed. Do whatever
print(f"LongName {key} has changes")
changed_longNames.append(key)
Or, as a list comprehension:
changed_longNames = [key for key in common_longNames if old_csv[key] != new_csv[key]]
Writing everything to a new csv file is also fairly trivial. Note that the sets don't preserve the order, so you might not get the result in the same order.
with open("deleted.csv", "w") as fh:
writer = csv.writer(fh)
for key in removed_longNames:
writer.writerow(old_csv[key])
with open("inserted.csv", "w") as fh:
writer = csv.writer(fh)
for key in added_longNames:
writer.writerow(new_csv[key])
with open("changed.csv", "w") as fh:
writer = csv.writer(fh)
for key in changed_longNames:
old_row = old_csv[key]
new_row = new_csv[key]
merged_row = []
for oi, ni in zip(old_row, new_row):
merged_row.append(oi)
merged_row.append(ni)
writer.writerow(merged_row)
This is one file result.csv:
M11251TH1230
M11543TH4292
M11435TDS144
This is another file sample.csv:
M11435TDS144,STB#1,Router#1
M11543TH4292,STB#2,Router#1
M11509TD9937,STB#3,Router#1
M11543TH4258,STB#4,Router#1
Can I write a Python program to compare both the files and if line in result.csv matches with the first word in the line in sample.csv, then append 1 else append 0 at every line in sample.csv?
import pandas as pd
d1 = pd.read_csv("1.csv",names=["Type"])
d2 = pd.read_csv("2.csv",names=["Type","Col2","Col3"])
d2["Index"] = 0
for x in d1["Type"] :
d2["Index"][d2["Type"] == x] = 1
d2.to_csv("3.csv",header=False)
Considering "1.csv" and "2.csv" are your csv input files and "3.csv" is the result you needed
The solution using csv.reader and csv.writer (csv module):
import csv
newLines = []
# change the file path to the actual one
with open('./data/result.csv', newline='\n') as csvfile:
data = csv.reader(csvfile)
items = [''.join(line) for line in data]
with open('./data/sample.csv', newline='\n') as csvfile:
data = list(csv.reader(csvfile))
for line in data:
line.append(1 if line[0] in items else 0)
newLines.append(line)
with open('./data/sample.csv', 'w', newline='\n') as csvfile:
writer = csv.writer(csvfile)
writer.writerows(newLines)
The sample.csv contents:
M11435TDS144,STB#1,Router#1,1
M11543TH4292,STB#2,Router#1,1
M11509TD9937,STB#3,Router#1,0
M11543TH4258,STB#4,Router#1,0
With only one column, I wonder why you made it as a result.csv. If it is not going to have any more columns, a simple file read operation would suffice. Along with converting the data from result.csv to dictionary will help in quick run as well.
result_file = "result.csv"
sample_file = "sample.csv"
with open(result_file) as fp:
result_data = fp.read()
result_dict = dict.fromkeys(result_data.split("\n"))
"""
You can change the above logic, in case you have very few fields on csv like this:
result_data = fp.readlines()
result_dict = {}
for result in result_data:
key, other_field = result.split(",", 1)
result_dict[key] = other_field.strip()
"""
#Since sample.csv is a real csv, using csv reader and writer
with open(sample_file, "rb") as fp:
sample_data = csv.reader(fp)
output_data = []
for data in sample_data:
output_data.append("%s,%d" % (data, data[0] in result_dict))
with open(sample_file, "wb") as fp:
data_writer = csv.writer(fp)
data_writer.writerows(output_data)
The following snippet of code will work for you
import csv
with open('result.csv', 'rb') as f:
reader = csv.reader(f)
result_list = []
for row in reader:
result_list.extend(row)
with open('sample.csv', 'rb') as f:
reader = csv.reader(f)
sample_list = []
for row in reader:
if row[0] in result_list:
sample_list.append(row + [1])
else:
sample_list.append(row + [0]
with open('sample.csv', 'wb') as f:
writer = csv.writer(f)
writer.writerows(sample_list)
http://example.com/item/all-atv-quad.html,David,"Punjab",+123456789123
http://example.com/item/70cc-2014.html,Qubee,"Capital",+987654321987
http://example.com/item/quad-bike-zenith.html,Zenith,"UP",+123456789123
I have this test.csv where I have scraped a few items from certain site but the thing is "number" field has redundancy. So I somehow need to remove a row that has the same number as before. This is just the example file, In the real file some numbers are repeated more than 50+ times.
import csv
with open('test.csv', newline='') as csvfile:
csvreader = csv.reader(csvfile, delimiter=',')
for column in csvreader:
"Some logic here"
if (column[3] == "+123456789123"):
print (column[0])
"or here"
I need reformated csv like this:
http://example.com/item/all-atv-quad.html,David,"Punjab",+123456789123
http://example.com/item/70cc-2014.html,Qubee,"Capital",+987654321987
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pandas as pd
def direct():
seen = set()
with open("test.csv") as infile, open("formatted.csv", 'w') as outfile:
for line in infile:
parts = line.rstrip().split(',')
number = parts[-1]
if number not in seen:
seen.add(number)
outfile.write(line)
def using_pandas():
"""Alternatively, use Pandas"""
df = pd.read_csv("test.csv", header=None)
df = df.drop_duplicates(subset=[3])
df.to_csv("formatted_pandas.csv", index=None, header=None)
def main():
direct()
using_pandas()
if __name__ == "__main__":
main()
This would filter out duplicates:
seen = set()
for line in csvreader:
if line[3] in seen:
continue
seen.add(line[3])
# write line to output file
And the csv read and write logic:
with open('test.csv') as fobj_in, open('test_clean.csv', 'w') as fobj_out:
csv_reader = csv.reader(fobj_in, delimiter=',')
csv_writer = csv.writer(fobj_out, delimiter=',')
seen = set()
for line in csvreader:
if line[3] in seen:
continue
seen.add(line[3])
csv_writer.writerow(line)
Some example data:
title1|title2|title3|title4|merge
test|data|here|and
test|data|343|AND
",3|data|343|and
My attempt at coding this:
import csv
import StringIO
storedoutput = StringIO.StringIO()
fields = ('title1', 'title2', 'title3', 'title4', 'merge')
with open('file.csv', 'rb') as input_csv:
reader = csv.DictReader(input_csv, fields, delimiter='|')
for counter, row in enumerate(reader):
counter += 1
#print row
if counter != 1:
for field in fields:
if field == "merge":
row['merge'] = ("%s%s%s" % (row["title1"], row["title3"], row["title4"]))
print row
storedoutput.writelines(','.join(map(str, row)) + '\n')
contents = storedoutput.getvalue()
storedoutput.close()
print "".join(contents)
with open('file.csv', 'rb') as input_csv:
input_csv = input_csv.read().strip()
output_csv = []
output_csv.append(contents.strip())
if "".join(output_csv) != input_csv:
with open('file.csv', 'wb') as new_csv:
new_csv.write("".join(output_csv))
Output should be
title1|title2|title3|title4|merge
test|data|here|and|testhereand
test|data|343|AND|test343AND
",3|data|343|and|",3343and
For your reference upon running this code the first print it prints the rows as I would hope then to appear in the output csv. However the second print prints the title row x times where x is the number of rows.
Any input or corrections or working code would be appreciated.
I think we can make this a lot simpler. Dealing with the rogue " was a bit of a nuisance, I admit, because you have to work hard to tell Python you don't want to worry about it.
import csv
with open('file.csv', 'rb') as input_csv, open("new_file.csv", "wb") as output_csv:
reader = csv.DictReader(input_csv, delimiter='|', quoting=csv.QUOTE_NONE)
writer = csv.DictWriter(output_csv, reader.fieldnames, delimiter="|",quoting=csv.QUOTE_NONE, quotechar=None)
merge_cols = "title1", "title3", "title4"
writer.writeheader()
for row in reader:
row["merge"] = ''.join(row[col] for col in merge_cols)
writer.writerow(row)
produces
$ cat new_file.csv
title1|title2|title3|title4|merge
test|data|here|and|testhereand
test|data|343|AND|test343AND
",3|data|343|and|",3343and
Note that even though you wanted the original file updated, I refused. Why? It's a bad idea, because then you can destroy your data while working on it.
How can I be so sure? Because that's exactly what I did when I first ran your code, and I know better. ;^)
That double quote in the last line is definitely messing up the csv.DictReader().
This works:
new_lines = []
with open('file.csv', 'rb') as f:
# skip the first line
new_lines.append(f.next().strip())
for line in f:
# the newline and split the fields
line = line.strip().split('|')
# exctract the field data you want
title1, title3, title4 = line[0], line[2], line[3]
# turn the field data into a string and append in to the rest
line.append(''.join([title1, title3, title4]))
# save the new line for later
new_lines.append('|'.join(line))
with open('file.csv', 'w') as f:
# make one long string and write it to the new file
f.write('\n'.join(new_lines))
import csv
import StringIO
stored_output = StringIO.StringIO()
with open('file.csv', 'rb') as input_csv:
reader = csv.DictReader(input_csv, delimiter='|', quoting=csv.QUOTE_NONE)
writer = csv.DictWriter(stored_output, reader.fieldnames, delimiter="|",quoting=csv.QUOTE_NONE, quotechar=None)
merge_cols = "title1", "title3", "title4"
writer.writeheader()
for row in reader:
row["merge"] = ''.join(row[col] for col in merge_cols)
writer.writerow(row)
contents = stored_output.getvalue()
stored_output.close()
print contents
with open('file.csv', 'rb') as input_csv:
input_csv = input_csv.read().strip()
if input_csv != contents.strip():
with open('file.csv', 'wb') as new_csv:
new_csv.write("".join(contents))
I'm trying to iterate over a CSV file that has a 'master list' of names, and compare it to another CSV file that contains only the names of people who were present and made phone calls.
I'm trying to iterate over the master list and compare it to the names in the other CSV file, take the number of calls made by the person and write a new CSV file containing number of Calls if the name isn't found or if it's 0, I need that column to have 0 there.
I'm not sure if its something incredibly simple I'm overlooking, or if I am truly going about this incorrectly.
Edited for formatting.
import csv
import sys
masterlst = open('masterlist.csv')
comparelst = open(sys.argv[1])
masterrdr = csv.DictReader(masterlst, dialect='excel')
comparerdr = csv.DictReader(comparelst, dialect='excel')
headers = comparerdr.fieldnames
with open('callcounts.csv', 'w') as outfile:
wrtr = csv.DictWriter(outfile, fieldnames=headers, dialect='excel', quoting=csv.QUOTE_MINIMAL, delimiter=',', escapechar='\n')
wrtr.writerow(dict((fn,fn) for fn in headers))
for lines in masterrdr:
for row in comparerdr:
if lines['Names'] == row['Names']:
print(lines['Names'] + ' has ' + row['Calls'] + ' calls')
wrtr.writerow(row)
elif lines['Names'] != row['Names']:
row['Calls'] = ('%s' % 0)
wrtr.writerow(row)
print(row['Names'] + ' had 0 calls')
masterlst.close()
comparelst.close()
Here's how I'd do it, assuming the file sizes do not prove to be problematic:
import csv
import sys
with open(sys.argv[1]) as comparelst:
comparerdr = csv.DictReader(comparelst, dialect='excel')
headers = comparerdr.fieldnames
names_and_counts = {}
for line in comparerdr:
names_and_counts[line['Names']] = line['Calls']
# or, if you're sure you only want the ones with 0 calls, just use a set and only add the line['Names'] values that that line['Calls'] == '0'
with open('masterlist.csv') as masterlst:
masterrdr = csv.DictReader(masterlst, dialect='excel')
with open('callcounts.csv', 'w') as outfile:
wrtr = csv.DictWriter(outfile, fieldnames=headers, dialect='excel', quoting=csv.QUOTE_MINIMAL, delimiter=',', escapechar='\n')
wrtr.writerow(dict((fn,fn) for fn in headers))
# or if you're on 2.7, wrtr.writeheader()
for line in masterrdr:
if names_and_counts.get(line['Names']) == '0':
row = {'Names': line['Names'], 'Calls': '0'}
wrtr.writerow(row)
That writes just the rows with 0 calls, which is what your text description said - you could tweak it if you wanted to write something else for non-0 calls.
Thanks everyone for the help. I was able to nest another with statement inside of my outer loop, and add a variable to test whether or not the name from the master list was found in the compare list. This is my final working code.
import csv
import sys
masterlst = open('masterlist.csv')
comparelst = open(sys.argv[1])
masterrdr = csv.DictReader(masterlst, dialect='excel')
comparerdr = csv.DictReader(comparelst, dialect='excel')
headers = comparerdr.fieldnames
with open('callcounts.csv', 'w') as outfile:
wrtr = csv.DictWriter(outfile, fieldnames=headers, dialect='excel', quoting=csv.QUOTE_MINIMAL, delimiter=',', escapechar='\n')
wrtr.writerow(dict((fn,fn) for fn in headers))
for line in masterrdr:
found = False
with open(sys.argv[1]) as loopfile:
looprdr = csv.DictReader(loopfile, dialect='excel')
for row in looprdr:
if row['Names'] == line['Names']:
line['Calls'] = row['Calls']
wrtr.writerow(line)
found = True
break
if found == False:
line['Calls'] = '0'
wrtr.writerow(line)
masterlst.close()
comparelst.close()