Remove newline character after specific words in csv? - python

I have a big csv file. After some items there is a newline character which is not supposed to be there. It is always after a specific item, let's say it's called 'foo'. I need to remove every newline character after foo. I figured out this is kind of what should happen:
for line in sys.stdin:
if line.split(",")[-1] == "foo":
line = line.rstrip()
How do I make sure I output the result back to the file?

You can't write line back to your original file but assuming you will use your script like python script.py < input_file.csv > output_file.csv you can simply print the lines you need:
import sys
for line in sys.stdin:
if line.split(",")[-1] == "foo":
line = line.rstrip()
# print() will append '\n' by default - we prevent it
print(line, end='')

I haven't tested this, but it should do what you need it to. This assumes there are no other items (other than foo) that has trailing white space that you don't want to strip. Otherwise, a simple conditional will fix that.
import csv
with open("/path/to/file", newline='') as f:
reader = csv.reader(f)
for row in reader:
for i, item in enumerate(row):
row[i] = item.rstrip()
with open("/path/to/file", 'w', newline='') as f:
writer = csv.writer(f)
writer.writerows(reader)

This answer just saves to a new csv file.
with open("test.csv", "r", newline="") as csvfile:
my_reader = csv.reader(csvfile, delimiter=',', quotechar='"')
with open("new.csv", "w", newline="") as csvfile2:
last_line = []
writer = csv.writer(csvfile2, delimiter=',', quotechar='"')
for line in my_reader:
if last_line != []:
writer.writerow(last_line + line)
last_line = []
elif line[-1] == "foo":
last_line = line
else:
writer.writerow(line)
if last_line != []: # when the last line also contain "foo"
writer.writerow(last_line)
Tested on a test.csv file:
this,"is,a ",book
this,is,foo
oh,my
this,foo
And gained a new.csv file:
this,"is,a ",book
this,is,foo,oh,my
this,foo

Related

how to avoid quoting in csv while deleting symbols?

trying to delete symbol , from my csv file but I am getting quotes and also nothing deletes at all
import csv
import string
input_file = open('/Users/gfidarov/Desktop/crosscheckmmm/Sheet1', 'r')
output_file = open('/Users/gfidarov/Desktop/crosscheckmmm/Sheet01', 'w')
data = csv.reader(input_file)
writer = csv.writer(output_file, quoting=csv.QUOTE_NONE, dialect='excel')
specials = ','
for line in data:
line = [value.replace(specials, '') for value in line]
print(line)
writer.writerow(line)
input_file.close()
output_file.close()
When I try to do it I have this error_csv.Error: need to escape, but no escapechar set
Maybe you should try to consider each line as a string in order to replace/delete a character in it:
import csv
import string
input_file = open('/Users/gfidarov/Desktop/crosscheckmmm/Sheet1', 'r')
output_file = open('/Users/gfidarov/Desktop/crosscheckmmm/Sheet01', 'w')
data = csv.reader(input_file)
writer = csv.writer(output_file,quoting=csv.QUOTE_ALL)# dialect='excel')
specials = ','
for line in data:
line = str(line)
new_line = str.replace(line,specials,'')
writer.writerow(new_line.split(','))
input_file.close()
output_file.close()
For the problem with escapechar, you can temporarily set an escapechar, then unset it after all your processing:
writer = csv.writer(output_file,quoting=csv.QUOTE_ALL, escapechar='\\')# dialect='excel')

How to read a CSV and adapt + write every row to another CSV?

I tried this but it just writes "lagerungskissen kleinkind,44" several times instead of transferring every row.
keyword = []
rank = []
rank = list(map(int, rank))
data = []
with open("keywords.csv", "r") as file:
for line in file:
data = line.strip().replace('"', '').split(",")
keyword = data[0]
rank = data[3]
import csv
with open("mynew.csv", "w", newline="") as f:
thewriter = csv.writer(f)
thewriter.writerow(["Keyword", "Rank"])
for row in keyword:
thewriter.writerow([keyword, rank])
It should look like this
This is writing the same line in your output CSV because the final block is
for row in keyword:
thewriter.writerow([keyword, rank])
Note that the keyword variable doesn't change in the loop, but the row does. You're writing that same [keyword, rank] line len(keyword) times.
I would use the csv package to do the reading and the writing for this. Something like
import csv
input_file = '../keywords.csv'
output_file = '../mynew.csv'
# open the files
fIn = open(input_file, 'r', newline='')
fOut = open(output_file, 'w')
csvIn = csv.reader(fIn, quotechar='"') # check the keyword args in the docs!
csvOut = csv.writer(fOut)
# write a header, then write each row one at a time
csvOut.writerow(['Keyword', 'Rank'])
for row in csvIn:
keyword = row[0]
rank = row[3]
csvOut.writerow([keyword, rank])
# and close the files
fOut.close()
fIn.close()
As as side note, you could write the above using the with context manager (e.g. with open(...) as file:). The answer here shows how to do it with multiple files (in this case fIn and fOut).

Reformat CSV according to certain field using python

http://example.com/item/all-atv-quad.html,David,"Punjab",+123456789123
http://example.com/item/70cc-2014.html,Qubee,"Capital",+987654321987
http://example.com/item/quad-bike-zenith.html,Zenith,"UP",+123456789123
I have this test.csv where I have scraped a few items from certain site but the thing is "number" field has redundancy. So I somehow need to remove a row that has the same number as before. This is just the example file, In the real file some numbers are repeated more than 50+ times.
import csv
with open('test.csv', newline='') as csvfile:
csvreader = csv.reader(csvfile, delimiter=',')
for column in csvreader:
"Some logic here"
if (column[3] == "+123456789123"):
print (column[0])
"or here"
I need reformated csv like this:
http://example.com/item/all-atv-quad.html,David,"Punjab",+123456789123
http://example.com/item/70cc-2014.html,Qubee,"Capital",+987654321987
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pandas as pd
def direct():
seen = set()
with open("test.csv") as infile, open("formatted.csv", 'w') as outfile:
for line in infile:
parts = line.rstrip().split(',')
number = parts[-1]
if number not in seen:
seen.add(number)
outfile.write(line)
def using_pandas():
"""Alternatively, use Pandas"""
df = pd.read_csv("test.csv", header=None)
df = df.drop_duplicates(subset=[3])
df.to_csv("formatted_pandas.csv", index=None, header=None)
def main():
direct()
using_pandas()
if __name__ == "__main__":
main()
This would filter out duplicates:
seen = set()
for line in csvreader:
if line[3] in seen:
continue
seen.add(line[3])
# write line to output file
And the csv read and write logic:
with open('test.csv') as fobj_in, open('test_clean.csv', 'w') as fobj_out:
csv_reader = csv.reader(fobj_in, delimiter=',')
csv_writer = csv.writer(fobj_out, delimiter=',')
seen = set()
for line in csvreader:
if line[3] in seen:
continue
seen.add(line[3])
csv_writer.writerow(line)

appending a data in a specific line of a text file in Python?

Let's say I have this textfile: date.txt. month|day|year
January|20|2014
February|10|
March|5|2013
I want to put 2012 after February|10|. how can i do that?
You need to read the file into memory, modify the desired line and write back the file.
temp = open('temp', 'wb')
with open('date.txt', 'r') as f:
for line in f:
if line.startswith('February'):
line = line.strip() + '2012\n'
temp.write(line)
temp.close()
shutils.move('temp', 'data.txt')
If you don't want to use a temporary file:
with open('date.txt', 'r+') as f: #r+ does the work of rw
lines = f.readlines()
for i, line in enumerate(lines):
if line.startswith('February'):
lines[i] = lines[i].strip() + '2012\n'
f.seek(0)
for line in lines:
f.write(line)
You can use the csv module, for example:
import csv
data = [
"January|20|2014",
"February|10|",
"March|5|2013"
]
reader = csv.reader(data, delimiter="|")
for line in reader:
line = [i if i != "" else "2012" for i in line]
print(line)
Please note: csv.reader() take as argument any iterable object. So, you can easily pass it a file object

Attempting to merge three columns in CSV, updating original CSV

Some example data:
title1|title2|title3|title4|merge
test|data|here|and
test|data|343|AND
",3|data|343|and
My attempt at coding this:
import csv
import StringIO
storedoutput = StringIO.StringIO()
fields = ('title1', 'title2', 'title3', 'title4', 'merge')
with open('file.csv', 'rb') as input_csv:
reader = csv.DictReader(input_csv, fields, delimiter='|')
for counter, row in enumerate(reader):
counter += 1
#print row
if counter != 1:
for field in fields:
if field == "merge":
row['merge'] = ("%s%s%s" % (row["title1"], row["title3"], row["title4"]))
print row
storedoutput.writelines(','.join(map(str, row)) + '\n')
contents = storedoutput.getvalue()
storedoutput.close()
print "".join(contents)
with open('file.csv', 'rb') as input_csv:
input_csv = input_csv.read().strip()
output_csv = []
output_csv.append(contents.strip())
if "".join(output_csv) != input_csv:
with open('file.csv', 'wb') as new_csv:
new_csv.write("".join(output_csv))
Output should be
title1|title2|title3|title4|merge
test|data|here|and|testhereand
test|data|343|AND|test343AND
",3|data|343|and|",3343and
For your reference upon running this code the first print it prints the rows as I would hope then to appear in the output csv. However the second print prints the title row x times where x is the number of rows.
Any input or corrections or working code would be appreciated.
I think we can make this a lot simpler. Dealing with the rogue " was a bit of a nuisance, I admit, because you have to work hard to tell Python you don't want to worry about it.
import csv
with open('file.csv', 'rb') as input_csv, open("new_file.csv", "wb") as output_csv:
reader = csv.DictReader(input_csv, delimiter='|', quoting=csv.QUOTE_NONE)
writer = csv.DictWriter(output_csv, reader.fieldnames, delimiter="|",quoting=csv.QUOTE_NONE, quotechar=None)
merge_cols = "title1", "title3", "title4"
writer.writeheader()
for row in reader:
row["merge"] = ''.join(row[col] for col in merge_cols)
writer.writerow(row)
produces
$ cat new_file.csv
title1|title2|title3|title4|merge
test|data|here|and|testhereand
test|data|343|AND|test343AND
",3|data|343|and|",3343and
Note that even though you wanted the original file updated, I refused. Why? It's a bad idea, because then you can destroy your data while working on it.
How can I be so sure? Because that's exactly what I did when I first ran your code, and I know better. ;^)
That double quote in the last line is definitely messing up the csv.DictReader().
This works:
new_lines = []
with open('file.csv', 'rb') as f:
# skip the first line
new_lines.append(f.next().strip())
for line in f:
# the newline and split the fields
line = line.strip().split('|')
# exctract the field data you want
title1, title3, title4 = line[0], line[2], line[3]
# turn the field data into a string and append in to the rest
line.append(''.join([title1, title3, title4]))
# save the new line for later
new_lines.append('|'.join(line))
with open('file.csv', 'w') as f:
# make one long string and write it to the new file
f.write('\n'.join(new_lines))
import csv
import StringIO
stored_output = StringIO.StringIO()
with open('file.csv', 'rb') as input_csv:
reader = csv.DictReader(input_csv, delimiter='|', quoting=csv.QUOTE_NONE)
writer = csv.DictWriter(stored_output, reader.fieldnames, delimiter="|",quoting=csv.QUOTE_NONE, quotechar=None)
merge_cols = "title1", "title3", "title4"
writer.writeheader()
for row in reader:
row["merge"] = ''.join(row[col] for col in merge_cols)
writer.writerow(row)
contents = stored_output.getvalue()
stored_output.close()
print contents
with open('file.csv', 'rb') as input_csv:
input_csv = input_csv.read().strip()
if input_csv != contents.strip():
with open('file.csv', 'wb') as new_csv:
new_csv.write("".join(contents))

Categories