I am trying to pull in data from an input file and iterate over a symbol file to create output for an output file, but my code is creating an unwanted duplicate in the output file. The input file is very big, so I need to filter the input first before I reference it against the symbol (city/state) file to generate the output.
import csv

i_file = 'InputFile.csv'
o_file = 'OutputFile.csv'
symbol_file = 'SymbolFile.csv'
city = 'Tampa'
state = 'FL'

with open(symbol_file, 'r') as symfile:
    with open(i_file, 'r') as infile:
        with open(o_file, 'w') as outfile:
            reader = csv.reader(infile)
            symbol = csv.reader(symfile)
            writer = csv.writer(outfile, delimiter=',')
            for row in reader:
                if row[2] == city and row[3] == state:
                    for line in symbol:
                        if row[4] == line[0]:
                            nline = [str(city)] + [str(line[3])]
                            writer.writerow(nline)
                    symfile.seek(0)
I only want one line for every line in the input file IF there is a matching line in the symbol file.
Try it like this then:
import csv

i_file = 'InputFile.csv'
o_file = 'OutputFile.csv'
symbol_file = 'SymbolFile.csv'
city = 'Tampa'
state = 'FL'

# load the symbols from the symbol file and store them in a dictionary
symbols = {}
with open(symbol_file, 'r') as symfile:
    for line in csv.reader(symfile):
        # key is line[0], which is the thing we match against
        # value is line[3], which appears to be the only thing of interest later
        symbols[line[0]] = line[3]

# now read the other files
with open(i_file, 'r') as infile, open(o_file, 'w') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile, delimiter=',')
    for row in reader:
        # `row[4] in symbols` checks whether row[4] is equal to a line[0] in the symbol file
        if row[2] == city and row[3] == state and row[4] in symbols:
            # get the symbol
            symbol = symbols[row[4]]
            # write output
            nline = [city, symbol]
            writer.writerow(nline)
I have a text file which contains this data (the items correspond to code,entry1,entry2):
a,1,2
b,2,3
c,4,5
....
....
Here a, b, c, ... will always be unique.
Every time, I read this file in Python either to create a new entry (for example d,6,7) or to update existing values (say a,1,2 to a,4,3).
I use the following code:
data = ['a', 5, 6]
datastring = ''
for d in data:
    datastring = datastring + str(d) + ','
try:
    with open("opfile.txt", "a") as f:
        f.write(datastring + '\n')
        f.close()
    return True
except:
    return False
This appends any data as a new entry.
I am trying something like this which checks the first character of each line:
f = open("opfile.txt", "r")
for x in f:
if(x[0] == username):
pass
I don't know how to combine these two so that a check is done on the first field (let's call it the id): if an entry with that id is already in the file, it should be replaced with the new data while all other data stays the same; otherwise it should be added as a new line item.
Read the file into a dictionary that uses the first field as keys. Update the appropriate entry, then write the dictionary back.
Use the csv module to parse and format the file.
import csv

data = ['a', 5, 6]

with open("opfile.txt", "r", newline='') as infile:
    incsv = csv.reader(infile)
    d = {row[0]: row for row in incsv if len(row) != 0}

d[data[0]] = data

with open("opfile.txt", "w", newline='') as outfile:
    outcsv = csv.writer(outfile)
    outcsv.writerows(d.values())
First, append any new rows to the file (see the sketch after the function below).
Second, to update existing rows in your file, try rewriting it with write:
def update_record(file_name, field1, field2, field3):
    with open(file_name, 'r') as f:
        lines = f.readlines()
    with open(file_name, 'w') as f:
        for line in lines:
            if field1 in line:
                f.write(field1 + ',' + field2 + ',' + field3 + '\n')
            else:
                f.write(line)
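For the append step, a minimal sketch might look like this (append_record is a hypothetical helper, assuming the record is a list of values as in the question):

def append_record(file_name, fields):
    # hypothetical helper: join the values with commas and add the row at the end of the file
    with open(file_name, 'a') as f:
        f.write(','.join(str(v) for v in fields) + '\n')

append_record("opfile.txt", ['d', 6, 7])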
I'm trying to use Python to copy lines from one csv file to another and add data to a new column in the process. The data is being copied correctly to the new file, but it's all being copied to the same line in the new file.
file = "C:/original_file.csv"
nf = "C:/file_updated.csv"
i = 0
with open(file, 'r') as origFile:
with open(nf, 'w') as newFile:
lineList = []
for line in origFile:
strippedLine = line.strip()
lineList = strippedLine.split(',')
lineList.append("C:/ATT" + str(i) + "_PHOTO 1.jpg")
lineStr = str(lineList)
lineStr = lineStr.replace("'", "")
newFile.write(lineStr)
print lineList
i += 1
origFile.close()
newFile.close()
How can I make it so that each line from the first file copies to a separate line of the new file?
file = "C:/original_file.csv"
nf = "C:/file_updated.csv"
i = 0
with open(file, 'r') as origFile:
with open(nf, 'w') as newFile:
lineList = []
for line in origFile:
strippedLine = line.strip()
lineList = strippedLine.split(',')
lineList.append("C:/ATT" + str(i) + "_PHOTO 1.jpg")
lineStr = str(lineList)
lineStr = lineStr.replace("'", "")
newFile.write(lineStr)
newFile.write('\n') #Insert a new line
print lineList
i += 1
origFile.close()
newFile.close()
No need to install pandas, the built-in csv library is great for this!!
$ cat tmp.csv
first,second
third,fourth
import csv

to_read = "./tmp.csv"
to_write = "./tmp2.csv"

with open(to_read, newline="") as to_read_fp, open(to_write, "w", newline="") as to_write_fp:
    reader = csv.reader(to_read_fp)
    writer = csv.writer(to_write_fp)
    for count, row in enumerate(reader):
        row.append(f"C:/ATT{count}_PHOTO 1.jpg")
        writer.writerow(row)
$ cat tmp2.csv
first,second,C:/ATT0_PHOTO 1.jpg
third,fourth,C:/ATT1_PHOTO 1.jpg
If you want to do it without any imports, you could try something like this, which adds a new column with the header New Field.
Of course, it assumes the original CSV has a header row.
file = "original_file.csv"
nf = "file_updated.csv"
with open(file, 'r') as origFile:
data = [line.strip().split(',') for line in origFile.readlines()]
header = data[0]
data = data[1:]
header.append('New Field')
data = [line + [f'C:/ATT{idx}_PHOTO 1.jpg'] for idx, line in enumerate(data)]
data = [','.join(line) for line in [header]+data]
with open(nf, 'w') as newFile:
newFile.writelines('\n'.join(data))
"""
SAMPLE INPUT
Field1,Field2
Data1,Data2
Data3,Data4
SAMPLE OUTPUT
Field1,Field2,New Field
Data1,Data2,C:/ATT0_PHOTO 1.jpg
Data3,Data4,C:/ATT1_PHOTO 1.jpg
"""
I have the following input file with a header row:
test_in.csv
LON,LAT,NUMBER,STREET,UNIT,CITY,DISTRICT,REGION,POSTCODE,ID,HASH
-72.5708234,41.4155142,39,HICKORY LA,,,,,,,8a0df668e0d49b02
-72.5647745,41.4160301,1213,KILLINGWORTH RD,,,,,,,b3ecaab86e476f46
I need to replace any one of the columns with a specified string; for example, the CITY column's data should be replaced from "" to "MyCity".
My code only outputs the header and the first row.
python test_forcefld.py test_in.csv MyCity CITY out_test.csv
import csv
import sys

in_file_name = sys.argv[1]
force_data = sys.argv[2]
force_fld = sys.argv[3]
out_file_name = sys.argv[4]

# First read top row/header from input file
fieldnames = []
for filename in [in_file_name]:
    with open(filename, "rb") as f_in:
        reader = csv.reader(f_in)
        headers = next(reader)
        for h in headers:
            fieldnames.append(h)

# print headers to output file
with open(out_file_name, 'w') as fou:
    dw = csv.DictWriter(fou, delimiter=',', fieldnames=fieldnames)
    dw.writeheader()

f_in2 = open(in_file_name, "rb")
reader2 = csv.DictReader(f_in2)  # Uses the field names in this file
datarow = next(reader2)
datarow[force_fld] = force_data

with open(out_file_name, 'wa') as fou:
    dw2 = csv.DictWriter(fou, delimiter=',', fieldnames=fieldnames)
    dw2.writeheader()
    dw2.writerow(datarow)
Output shows
LON,LAT,NUMBER,STREET,UNIT,CITY,DISTRICT,REGION,POSTCODE,ID,HASH
-72.5708234,41.4155142,39,HICKORY LA,,MyCity,,,,,8a0df668e0d49b02
Your code is a little difficult to read, but assuming datarow is a dictionary containing your records:
In your last line, change
dw2.writerow(datarow)
into
dw2.writerows(datarow)
While you're at it, you should also consider using datarow.keys() for your fieldnames, for conciseness.
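Putting those suggestions together, here is a minimal sketch (an assumption-laden rewrite rather than your original code: it uses Python 3 text-mode files instead of 'rb'/'wa', reads every data row rather than just the first, and takes the fieldnames from the first record's keys):

import csv
import sys

in_file_name, force_data, force_fld, out_file_name = sys.argv[1:5]

with open(in_file_name, newline='') as f_in:
    # read every data row, forcing the requested field on each one
    rows = [dict(row, **{force_fld: force_data}) for row in csv.DictReader(f_in)]

with open(out_file_name, 'w', newline='') as f_out:
    # fieldnames taken from the first record's keys, as suggested above
    writer = csv.DictWriter(f_out, fieldnames=rows[0].keys())
    writer.writeheader()
    writer.writerows(rows)  # writerows() handles the whole list at once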
This should do it, you just need pandas:
import pandas as pd
df = pd.read_csv(in_file_name, sep=',')
df['CITY'].fillna('MyCity', inplace=True)
And to save it:
df.to_csv(out_file_name)
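Note that to_csv also writes the DataFrame's row index as an extra first column by default; if you want the output to keep the same columns as the input, pass index=False:

df.to_csv(out_file_name, index=False)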
You can try something like this in order to get your desired file:
I'm assuming your input file is called f_input.txt and your output file is called f_output.txt:
data = list(k.rstrip().split(',') for k in open("f_input.txt", 'r'))

with open("f_output.txt", 'a+') as f:
    f.write(",".join(data[0]) + '\n')
    for k in data[1:]:
        # CITY is column index 5; rebuild the row with the forced value
        # (change the index if you need to handle another position)
        f.write(",".join(k[:5] + ["MyCity"] + k[6:]) + "\n")
This worked in the end:
import csv
import sys

in_file_name = sys.argv[1]
force_data = sys.argv[2]
force_fld = sys.argv[3]
out_file_name = sys.argv[4]

# First read top row/header from input file
fieldnames = []
for filename in [in_file_name]:
    with open(filename, "rb") as f_in:
        reader = csv.reader(f_in)
        headers = next(reader)
        for h in headers:
            fieldnames.append(h)

f_in2 = open(in_file_name, "r")

# print headers to output file
fou = open(out_file_name, 'wa')
dw = csv.DictWriter(fou, delimiter=',', fieldnames=fieldnames)
dw.writeheader()

reader2 = csv.DictReader(f_in2)  # Uses the field names in this file
for row in reader2:
    row[force_fld] = force_data
    dw2 = csv.DictWriter(fou, delimiter=',', fieldnames=fieldnames)
    dw2.writerow(row)
This is one file result.csv:
M11251TH1230
M11543TH4292
M11435TDS144
This is another file sample.csv:
M11435TDS144,STB#1,Router#1
M11543TH4292,STB#2,Router#1
M11509TD9937,STB#3,Router#1
M11543TH4258,STB#4,Router#1
Can I write a Python program to compare both files, so that if a line in result.csv matches the first word of a line in sample.csv, a 1 is appended to that line, and otherwise a 0, for every line in sample.csv?
import pandas as pd

d1 = pd.read_csv("1.csv", names=["Type"])
d2 = pd.read_csv("2.csv", names=["Type", "Col2", "Col3"])

d2["Index"] = 0
for x in d1["Type"]:
    # use .loc to avoid chained-assignment pitfalls
    d2.loc[d2["Type"] == x, "Index"] = 1

d2.to_csv("3.csv", header=False, index=False)
Considering "1.csv" and "2.csv" are your csv input files and "3.csv" is the result you needed
The solution using csv.reader and csv.writer (csv module):
import csv

newLines = []

# change the file path to the actual one
with open('./data/result.csv', newline='\n') as csvfile:
    data = csv.reader(csvfile)
    items = [''.join(line) for line in data]

with open('./data/sample.csv', newline='\n') as csvfile:
    data = list(csv.reader(csvfile))
    for line in data:
        line.append(1 if line[0] in items else 0)
        newLines.append(line)

with open('./data/sample.csv', 'w', newline='\n') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(newLines)
The sample.csv contents:
M11435TDS144,STB#1,Router#1,1
M11543TH4292,STB#2,Router#1,1
M11509TD9937,STB#3,Router#1,0
M11543TH4258,STB#4,Router#1,0
With only one column, I wonder why you made it a result.csv. If it is not going to have any more columns, a simple file read operation would suffice. Converting the data from result.csv to a dictionary will also help it run quickly.
result_file = "result.csv"
sample_file = "sample.csv"
with open(result_file) as fp:
result_data = fp.read()
result_dict = dict.fromkeys(result_data.split("\n"))
"""
You can change the above logic, in case you have very few fields on csv like this:
result_data = fp.readlines()
result_dict = {}
for result in result_data:
key, other_field = result.split(",", 1)
result_dict[key] = other_field.strip()
"""
#Since sample.csv is a real csv, using csv reader and writer
with open(sample_file, "rb") as fp:
sample_data = csv.reader(fp)
output_data = []
for data in sample_data:
output_data.append("%s,%d" % (data, data[0] in result_dict))
with open(sample_file, "wb") as fp:
data_writer = csv.writer(fp)
data_writer.writerows(output_data)
The following snippet of code will work for you
import csv

with open('result.csv', 'rb') as f:
    reader = csv.reader(f)
    result_list = []
    for row in reader:
        result_list.extend(row)

with open('sample.csv', 'rb') as f:
    reader = csv.reader(f)
    sample_list = []
    for row in reader:
        if row[0] in result_list:
            sample_list.append(row + [1])
        else:
            sample_list.append(row + [0])

with open('sample.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerows(sample_list)
I'm using this information (downloaded the file to my computer) http://www.genome.gov/admin/gwascatalog.txt
and wrote this
import csv

path = '/Users/mtchavez/Documents/ALS/Diseasome/GWAS.txt'
read_file = open(path)
reader = csv.reader(read_file, delimiter='\t')
fieldnames = reader.next()
rows = list(reader)
read_file.close()

write_file = open('datatest.csv', 'wb')
writer = csv.writer(write_file, delimiter='\t')
writer.writerow(('disease', 'genes'))

disease_gene = dict()
for row in rows:
    disease = row[7]
    reported_genes = row[13]
but I get an error message:
File "data1.py", line 18, in <module>
disease = row[7]
IndexError: list index out of range
There is an empty line at the end of this csv file and it will create an empty row. Delete the last line and the code works fine.
Try filtering for empty lines:
for row in rows:
    if not row: continue
    disease = row[7]
    reported_genes = row[13]
Or more specifically, filter for the desired length:
for row in rows:
    if len(row) != EXPECTED_LENGTH_OF_RECORD: continue
    disease = row[7]
    reported_genes = row[13]
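EXPECTED_LENGTH_OF_RECORD is just a placeholder here; since the header row was already read into fieldnames in your code, one way to set it (an assumption, not part of the original answer) is:

# number of columns expected, taken from the header row read earlier
EXPECTED_LENGTH_OF_RECORD = len(fieldnames)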