Python file matching and appending

This is one file result.csv:
M11251TH1230
M11543TH4292
M11435TDS144
This is another file sample.csv:
M11435TDS144,STB#1,Router#1
M11543TH4292,STB#2,Router#1
M11509TD9937,STB#3,Router#1
M11543TH4258,STB#4,Router#1
Can I write a Python program to compare the two files, and if a line in result.csv matches the first field of a line in sample.csv, append 1, otherwise append 0, to every line in sample.csv?

import pandas as pd

d1 = pd.read_csv("1.csv", names=["Type"])
d2 = pd.read_csv("2.csv", names=["Type", "Col2", "Col3"])
d2["Index"] = 0
for x in d1["Type"]:
    # use .loc to avoid chained-assignment problems
    d2.loc[d2["Type"] == x, "Index"] = 1
d2.to_csv("3.csv", header=False)
Here, "1.csv" and "2.csv" are your CSV input files and "3.csv" is the result file you need.
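The per-row loop can also be replaced by a vectorized membership test; a minimal sketch, assuming the same file names as above:

import pandas as pd

d1 = pd.read_csv("1.csv", names=["Type"])
d2 = pd.read_csv("2.csv", names=["Type", "Col2", "Col3"])
# isin builds the 0/1 flag column in one step
d2["Index"] = d2["Type"].isin(d1["Type"]).astype(int)
d2.to_csv("3.csv", header=False)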

The solution using csv.reader and csv.writer (csv module):
import csv

newLines = []
# change the file path to the actual one
with open('./data/result.csv', newline='\n') as csvfile:
    data = csv.reader(csvfile)
    items = [''.join(line) for line in data]

with open('./data/sample.csv', newline='\n') as csvfile:
    data = list(csv.reader(csvfile))
    for line in data:
        line.append(1 if line[0] in items else 0)
        newLines.append(line)

with open('./data/sample.csv', 'w', newline='\n') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(newLines)
The sample.csv contents after running the script:
M11435TDS144,STB#1,Router#1,1
M11543TH4292,STB#2,Router#1,1
M11509TD9937,STB#3,Router#1,0
M11543TH4258,STB#4,Router#1,0

With only one column, I wonder why you made result.csv a CSV at all. If it is never going to have more columns, a simple file read would suffice, and converting the data from result.csv into a dictionary will make the lookups quick as well.
import csv

result_file = "result.csv"
sample_file = "sample.csv"

with open(result_file) as fp:
    result_data = fp.read()
result_dict = dict.fromkeys(result_data.split("\n"))
"""
You can change the above logic in case you have a few more fields in the csv, like this:
    result_data = fp.readlines()
    result_dict = {}
    for result in result_data:
        key, other_fields = result.split(",", 1)
        result_dict[key] = other_fields.strip()
"""
# Since sample.csv is a real csv, use csv.reader and csv.writer
with open(sample_file, "rb") as fp:
    sample_data = csv.reader(fp)
    output_data = []
    for data in sample_data:
        # append 1 if the first field is a key in result_dict, else 0
        output_data.append(data + [1 if data[0] in result_dict else 0])

with open(sample_file, "wb") as fp:
    data_writer = csv.writer(fp)
    data_writer.writerows(output_data)

The following snippet of code will work for you
import csv

with open('result.csv', 'rb') as f:
    reader = csv.reader(f)
    result_list = []
    for row in reader:
        result_list.extend(row)

with open('sample.csv', 'rb') as f:
    reader = csv.reader(f)
    sample_list = []
    for row in reader:
        if row[0] in result_list:
            sample_list.append(row + [1])
        else:
            sample_list.append(row + [0])

with open('sample.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerows(sample_list)
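One small performance aside: both of the last two answers test membership against a Python list, which is a linear scan per row. If result.csv grows large, collecting the serial numbers into a set keeps each lookup constant-time; a minimal sketch of building such a set:

import csv

with open('result.csv') as f:
    # collect every serial number into a set for fast membership tests
    result_set = {value for row in csv.reader(f) for value in row}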

Python csv - replace any columns with specified value

I have the following input file with a header row:
test_in.csv
LON,LAT,NUMBER,STREET,UNIT,CITY,DISTRICT,REGION,POSTCODE,ID,HASH
-72.5708234,41.4155142,39,HICKORY LA,,,,,,,8a0df668e0d49b02
-72.5647745,41.4160301,1213,KILLINGWORTH RD,,,,,,,b3ecaab86e476f46
I need to replace the data in any specified column with a specified string;
for example, the CITY column's data should be replaced from "" to "MyCity".
My code only outputs the header and the first row.
python test_forcefld.py test_in.csv MyCity CITY out_test.csv
import csv
import sys

in_file_name = sys.argv[1]
force_data = sys.argv[2]
force_fld = sys.argv[3]
out_file_name = sys.argv[4]

# First read top row/header from input file
fieldnames = []
for filename in [in_file_name]:
    with open(filename, "rb") as f_in:
        reader = csv.reader(f_in)
        headers = next(reader)
        for h in headers:
            fieldnames.append(h)

# print headers to output file
with open(out_file_name, 'w') as fou:
    dw = csv.DictWriter(fou, delimiter=',', fieldnames=fieldnames)
    dw.writeheader()

f_in2 = open(in_file_name, "rb")
reader2 = csv.DictReader(f_in2)  # Uses the field names in this file
datarow = next(reader2)
datarow[force_fld] = force_data
with open(out_file_name, 'wa') as fou:
    dw2 = csv.DictWriter(fou, delimiter=',', fieldnames=fieldnames)
    dw2.writeheader()
    dw2.writerow(datarow)
Output shows
LON,LAT,NUMBER,STREET,UNIT,CITY,DISTRICT,REGION,POSTCODE,ID,HASH
-72.5708234,41.4155142,39,HICKORY LA,,MyCity,,,,,8a0df668e0d49b02
Your code is a little difficult to read, but assuming datarow is a dictionary containing your records:
In your last row, change
dw2.writerow(datarow)
into
dw2.writerows(datarow)
While you're at it, you should also consider using datarow.keys() for your fieldnames, for conciseness.
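A minimal sketch of that last suggestion, assuming datarow and fou are the names from the question's code and that datarow came from a csv.DictReader (so its keys preserve the input column order):

dw2 = csv.DictWriter(fou, delimiter=',', fieldnames=datarow.keys())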
This should do it; you just need pandas:
import pandas as pd
df = pd.read_csv(in_file_name, sep=',')
df['CITY'].fillna('MyCity', inplace=True)
And to save it:
df.to_csv(out_file_name)
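One detail worth noting: DataFrame.to_csv writes the index as an extra first column by default, so to keep the original layout you probably want:

df.to_csv(out_file_name, index=False)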
You can try something like this in order to get your desired file:
I'm assuming your input file is called f_input.txt and your output file is called f_output.txt:
data = list(k.rstrip().split(',') for k in open("f_input.txt", 'r'))

with open("f_output.txt", 'a+') as f:
    # write the header line unchanged
    f.write(",".join(data[0]) + '\n')
    for k in data[1:]:
        # CITY is column index 5; rebuild each line as k[:5] + your data + k[6:]
        # adjust the indices if you need to handle another position
        f.write(",".join(k[:5]) + ",MyCity," + ",".join(k[6:]) + "\n")
This worked in the end:
import csv
import sys

in_file_name = sys.argv[1]
force_data = sys.argv[2]
force_fld = sys.argv[3]
out_file_name = sys.argv[4]

# First read top row/header from input file
fieldnames = []
for filename in [in_file_name]:
    with open(filename, "rb") as f_in:
        reader = csv.reader(f_in)
        headers = next(reader)
        for h in headers:
            fieldnames.append(h)

f_in2 = open(in_file_name, "r")

# print headers to output file
fou = open(out_file_name, 'wa')
dw = csv.DictWriter(fou, delimiter=',', fieldnames=fieldnames)
dw.writeheader()

reader2 = csv.DictReader(f_in2)  # Uses the field names in this file
for row in reader2:
    row[force_fld] = force_data
    dw2 = csv.DictWriter(fou, delimiter=',', fieldnames=fieldnames)
    dw2.writerow(row)
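A side note for anyone running this on Python 3: the csv module there wants text-mode files opened with newline='' rather than the binary 'rb' mode, and 'wa' is not a valid mode string. A minimal Python 3 equivalent of the loop above, reusing the same variable names, might look like:

with open(in_file_name, newline='') as f_in, open(out_file_name, 'w', newline='') as fou:
    reader = csv.DictReader(f_in)
    writer = csv.DictWriter(fou, delimiter=',', fieldnames=reader.fieldnames)
    writer.writeheader()
    for row in reader:
        row[force_fld] = force_data
        writer.writerow(row)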

Writing to a file after parsing

I have written a small Python program that reads a sample csv file and copies its first column to a temporary csv file. When I then try to compare that temporary file with another text file and write the result to a third file called result.txt, the file is created but its content is empty.
But when I tested the pieces in chunks, they worked fine.
import csv

f = open("sample.csv", "r")
reader = csv.reader(f)
data = open("temp1.csv", "w")
w = csv.writer(data)
for row in reader:
    my_row = []
    my_row.append(row[0])
    w.writerow(my_row)

with open('temp1.csv', 'r') as file1:
    with open('serialNumber.txt', 'r') as file2:
        same = set(file1).intersection(file2)

with open('result.txt', 'w') as file_out:
    for line in same:
        file_out.write(line)
        print line
sample.csv
M11435TDS144,STB#1,Router#1
M11543TH4292,STB#2,Router#1
M11509TD9937,STB#3,Router#1
M11543TH4258,STB#4,Router#1
serialNumber.txt
G1A114042400571
M11543TH4258
M11251TH1230
M11435TDS144
M11543TH4292
M11509TD9937
You should close the output file (temp1.csv) before you read data from it.
import csv

f = open("sample.csv", "r")
reader = csv.reader(f)
data = open("temp1.csv", "w")
w = csv.writer(data)
for row in reader:
    my_row = []
    my_row.append(row[0])
    w.writerow(my_row)
data.close()  # <--- Should close it before reading it in the same program !!

with open('temp1.csv', 'r') as file1:
    with open('serialNumber.txt', 'r') as file2:
        same = set(file1).intersection(file2)

with open('result.txt', 'w') as file_out:
    for line in same:
        file_out.write(line)
        print line
Points regarding the code:
The data file handle is not closed; call data.close() after writing to temp1.csv.
In same = set(file1).intersection(file2) you pass the file handles straight to set() and intersection(). That works on any iterable, but reading the lines explicitly makes the intent clearer: same = set(file1.readlines()).intersection(file2.readlines())
Working Code:
import csv

f = open("sample.csv", "r")
reader = csv.reader(f)
data = open("temp1.csv", "wb")
w = csv.writer(data)
for row in reader:
    my_row = []
    if len(row) != 0:
        my_row.append(row[0])
        w.writerow(my_row)
# File should be closed
data.close()

with open('temp1.csv', 'r') as file1:
    with open('serialNumber.txt', 'r') as file2:
        tmp_list = file1.readlines()
        ser_list = file2.readlines()
        same = set(tmp_list).intersection(ser_list)

with open('result.txt', 'w') as file_out:
    for line in same:
        file_out.write(line)
Content of temp1.csv:
M11435TDS144
M11543TH4292
M11509TD9937
M11543TH4258
Content of result.txt :
M11543TH4258
M11543TH4292
M11435TDS144
You can use with for opening files sample.csv and temp1.csv as below.
import csv

with open("sample.csv") as f:
    with open("temp1.csv", 'wb') as data:
        reader = csv.reader(f)
        w = csv.writer(data)
        for row in reader:
            my_row = []
            my_row.append(row[0])
            w.writerow(my_row)

with open('temp1.csv', 'r') as file1:
    with open('serialNumber.txt', 'r') as file2:
        same = set(file1.readlines()).intersection(file2.readlines())

with open('result.txt', 'w') as file_out:
    for line in same:
        file_out.write(line)
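One more hedged caveat that applies to all of these variants: csv.writer terminates rows with \r\n by default, while serialNumber.txt presumably uses plain \n, so an exact intersection of raw lines can silently miss matches. Stripping the line endings before comparing sidesteps that; a minimal sketch, assuming the same file names as above:

with open('temp1.csv') as file1, open('serialNumber.txt') as file2:
    # compare bare serial numbers, ignoring \n vs \r\n differences
    same = {line.strip() for line in file1} & {line.strip() for line in file2}

with open('result.txt', 'w') as file_out:
    for serial in same:
        file_out.write(serial + '\n')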

Merging selected columns from multiple text files into one text file using Python

I have multiple text files with 4 columns (tab separated). Each file will have around 2000 rows. Using Python, how can I create a new file that looks like this?
file1column1 file1column4 file2column4 file3column4 ...fileNcolumn4
Thanks.
Here is the code I tried:
import csv

file_lists = ['file1.data', 'file2.data']
temp_data = []
for a_file in file_lists:
    file_h = open(a_file)
    a_list = []
    csv_reader = csv.reader(file_h, delimiter='\t')
    for row in csv_reader:
        if a_file == "file1.data":
            a_list.extend([row[0], row[3]])
        else:
            a_list.append(row[3])
    temp_data.append((n for n in a_list))
    file_h.close()

with open('output.data', 'w') as output_file:
    csv_writer = csv.writer(output_file, delimiter='\t')
    for row in list(zip(*temp_data)):
        csv_writer.writerow(row)
    output_file.close()
Courtesy: Combining columns of multiple files in one file - Python
I am getting results in the following format though:
file1column1 file2column4
file1column4 file2column4
file1column1 file2column4
file1column4 file2column4
What about:
import csv

# (file name, 0-based column index) pairs: column 1 of file1, column 4 of file1, column 4 of file2, ...
file_lists = [('file1.data', 0), ('file1.data', 3), ('file2.data', 3)]
temp_data = []
for a_file in file_lists:
    file_h = open(a_file[0])
    a_list = []
    csv_reader = csv.reader(file_h, delimiter='\t')
    for row in csv_reader:
        a_list.append(row[a_file[1]])
    temp_data.append((n for n in a_list))
    file_h.close()

with open('output.data', 'w') as output_file:
    csv_writer = csv.writer(output_file, delimiter='\t')
    for row in list(zip(*temp_data)):
        csv_writer.writerow(row)
You don't need to close the file when you use with ... as.
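Extending that idea to the "...fileNcolumn4" case from the question is just a matter of building the (file name, column index) pairs programmatically; a minimal sketch, assuming hypothetical input files named file1.data through fileN.data:

# column 1 and column 4 of file1, then column 4 of every remaining file
n_files = 5  # hypothetical number of input files
file_lists = [('file1.data', 0), ('file1.data', 3)]
file_lists += [('file%d.data' % i, 3) for i in range(2, n_files + 1)]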

generate a header row using python's csv.writer

I have a bit of Python code that produces a .csv file; however, I don't know how to add column names, or a header row. Here is my code:
handle = open(sys.argv[1])
with open('protparams.csv', 'w') as fp:
    writer = csv.writer(fp, delimiter=',')
    for record in SeqIO.parse(handle, "fasta"):
        seq = str(record.seq)
        X = ProtParam.ProteinAnalysis(seq)
        data = [seq, X.get_amino_acids_percent(), X.aromaticity(), X.gravy(), X.isoelectric_point(), X.secondary_structure_fraction(), X.molecular_weight(), X.instability_index()]
        writer.writerow(data)
I have tried adding in something like:
writer = csv.writer(fp, delimiter=',',[seq,aa_percentage,aromaticity,gravy,isoelectric_point,secondary_structure_fraction,molecular_weight,instability_index])
but this obviously doesn't work.
Does anyone have any ideas?
Write the headers before the loop:
import sys
import csv

from Bio import SeqIO
from Bio.SeqUtils import ProtParam

handle = open(sys.argv[1])
with open('protparams.csv', 'w') as fp:
    writer = csv.writer(fp, delimiter=',')
    # header row, written once before the data rows
    writer.writerow(['heading1', 'heading2', 'heading3'])
    for record in SeqIO.parse(handle, "fasta"):
        seq = str(record.seq)
        X = ProtParam.ProteinAnalysis(seq)
        data = [seq, X.get_amino_acids_percent(), X.aromaticity(), X.gravy(), X.isoelectric_point(), X.secondary_structure_fraction(), X.molecular_weight(), X.instability_index()]
        writer.writerow(data)
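In this particular script the placeholder headings could simply reuse the names the question already listed, so the header lines up with the eight data columns:

writer.writerow(['seq', 'aa_percentage', 'aromaticity', 'gravy', 'isoelectric_point', 'secondary_structure_fraction', 'molecular_weight', 'instability_index'])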

Attempting to merge three columns in CSV, updating original CSV

Some example data:
title1|title2|title3|title4|merge
test|data|here|and
test|data|343|AND
",3|data|343|and
My attempt at coding this:
import csv
import StringIO

storedoutput = StringIO.StringIO()
fields = ('title1', 'title2', 'title3', 'title4', 'merge')
with open('file.csv', 'rb') as input_csv:
    reader = csv.DictReader(input_csv, fields, delimiter='|')
    for counter, row in enumerate(reader):
        counter += 1
        #print row
        if counter != 1:
            for field in fields:
                if field == "merge":
                    row['merge'] = ("%s%s%s" % (row["title1"], row["title3"], row["title4"]))
            print row
            storedoutput.writelines(','.join(map(str, row)) + '\n')
contents = storedoutput.getvalue()
storedoutput.close()
print "".join(contents)

with open('file.csv', 'rb') as input_csv:
    input_csv = input_csv.read().strip()
output_csv = []
output_csv.append(contents.strip())
if "".join(output_csv) != input_csv:
    with open('file.csv', 'wb') as new_csv:
        new_csv.write("".join(output_csv))
Output should be
title1|title2|title3|title4|merge
test|data|here|and|testhereand
test|data|343|AND|test343AND
",3|data|343|and|",3343and
For your reference: upon running this code, the first print prints the rows as I would hope them to appear in the output csv. However, the second print prints the title row x times, where x is the number of rows.
Any input, corrections or working code would be appreciated.
I think we can make this a lot simpler. Dealing with the rogue " was a bit of a nuisance, I admit, because you have to work hard to tell Python you don't want to worry about it (quoting=csv.QUOTE_NONE tells the reader and writer not to treat the quote character specially, so the stray " passes through verbatim).
import csv

with open('file.csv', 'rb') as input_csv, open("new_file.csv", "wb") as output_csv:
    reader = csv.DictReader(input_csv, delimiter='|', quoting=csv.QUOTE_NONE)
    writer = csv.DictWriter(output_csv, reader.fieldnames, delimiter="|", quoting=csv.QUOTE_NONE, quotechar=None)
    merge_cols = "title1", "title3", "title4"
    writer.writeheader()
    for row in reader:
        row["merge"] = ''.join(row[col] for col in merge_cols)
        writer.writerow(row)
produces
$ cat new_file.csv
title1|title2|title3|title4|merge
test|data|here|and|testhereand
test|data|343|AND|test343AND
",3|data|343|and|",3343and
Note that even though you wanted the original file updated, I refused. Why? It's a bad idea, because then you can destroy your data while working on it.
How can I be so sure? Because that's exactly what I did when I first ran your code, and I know better. ;^)
That double quote in the last line is definitely messing up the csv.DictReader().
This works:
new_lines = []
with open('file.csv', 'rb') as f:
    # keep the first (header) line as-is
    new_lines.append(f.next().strip())
    for line in f:
        # strip the newline and split the fields
        line = line.strip().split('|')
        # extract the field data you want
        title1, title3, title4 = line[0], line[2], line[3]
        # turn the field data into a string and append it to the rest
        line.append(''.join([title1, title3, title4]))
        # save the new line for later
        new_lines.append('|'.join(line))

with open('file.csv', 'w') as f:
    # make one long string and write it to the new file
    f.write('\n'.join(new_lines))
import csv
import StringIO

stored_output = StringIO.StringIO()
with open('file.csv', 'rb') as input_csv:
    reader = csv.DictReader(input_csv, delimiter='|', quoting=csv.QUOTE_NONE)
    writer = csv.DictWriter(stored_output, reader.fieldnames, delimiter="|", quoting=csv.QUOTE_NONE, quotechar=None)
    merge_cols = "title1", "title3", "title4"
    writer.writeheader()
    for row in reader:
        row["merge"] = ''.join(row[col] for col in merge_cols)
        writer.writerow(row)

contents = stored_output.getvalue()
stored_output.close()
print contents

with open('file.csv', 'rb') as input_csv:
    input_csv = input_csv.read().strip()

if input_csv != contents.strip():
    with open('file.csv', 'wb') as new_csv:
        new_csv.write("".join(contents))
