Python read CSV file, and write to another skipping columns - python

I have a CSV input file with 18 columns.
I need to create a new CSV file with all columns from the input except columns 4 and 5.
My function currently looks like this:
def modify_csv_report(input_csv, output_csv):
    """Copy a leading slice of every row of ``input_csv`` into ``output_csv``.

    Each output row is ``row[begin:end]``. With ``begin = 0`` and ``end = 3``
    that is columns 0, 1 and 2, because the stop index of a slice is
    exclusive.

    Args:
        input_csv: Path of the CSV file to read.
        output_csv: Path of the CSV file to create or overwrite.

    Returns:
        ``output_csv``, unchanged.
    """
    begin = 0
    end = 3
    # newline="" hands line-ending handling to the csv module, as its
    # documentation recommends for every file object passed to it.
    with open(input_csv, "r", newline="") as file_in, \
            open(output_csv, "w", newline="") as file_out:
        writer = csv.writer(file_out)
        for row in csv.reader(file_in):
            writer.writerow(row[begin:end])
    return output_csv
So it reads and writes columns 0 - 3, but I don't know how to skip columns 4 and 5 and continue from there.

You can add the other part of the row using slicing, like you did with the first part:
# row[:4] keeps columns 0-3 and row[6:] keeps column 6 onward, so the
# concatenated row omits columns 4 and 5.
writer.writerow(row[:4] + row[6:])
Note that to include column 3, the stop index of the first slice should be 4. Specifying start index 0 is also usually not necessary.
A more general approach would employ a list comprehension and enumerate:
# Zero-based positions of the columns to leave out.
exclude = (4, 5)
# Rebuild the row from every cell whose position is not excluded.
writer.writerow([
    cell
    for position, cell in enumerate(row)
    if position not in exclude
])

If your CSV has meaningful headers, an alternative to slicing your rows by index is to use the DictReader and DictWriter classes.
#!/usr/bin/env python
"""Write only the named columns of some CSV data to ``outfile.csv``."""
from csv import DictReader, DictWriter

data = '''A,B,C
1,2,3
4,5,6
6,7,8'''

# DictReader accepts any iterable of lines; the first line supplies the keys.
reader = DictReader(data.splitlines())

# You'll need your fieldnames first in a list to ensure order
fieldnames = ['A', 'C']
# We'll also use a set for efficient lookup
fieldnames_set = set(fieldnames)

# newline='' hands line-ending handling to the csv module.
with open('outfile.csv', 'w', newline='') as outfile:
    writer = DictWriter(outfile, fieldnames)
    writer.writeheader()
    for row in reader:
        # Keep only the key/value pairs whose key is a wanted column.
        # dict.items() replaces the Python 2-only dict.iteritems().
        filtered_row = {k: v for k, v in row.items() if k in fieldnames_set}
        writer.writerow(filtered_row)

This is what you want:
import csv
def remove_csv_columns(input_csv, output_csv, exclude_column_indices):
    """Copy ``input_csv`` to ``output_csv`` without the given columns.

    Args:
        input_csv: Path of the CSV file to read.
        output_csv: Path of the CSV file to create or overwrite.
        exclude_column_indices: Iterable of zero-based column positions to
            drop from every row, header row included.
    """
    # Build the lookup once so each per-cell membership test is O(1).
    excluded = frozenset(exclude_column_indices)
    # newline='' hands line-ending handling to the csv module.
    with open(input_csv, newline='') as file_in, \
            open(output_csv, 'w', newline='') as file_out:
        reader = csv.reader(file_in)
        writer = csv.writer(file_out)
        writer.writerows(
            [col for idx, col in enumerate(row) if idx not in excluded]
            for row in reader)
remove_csv_columns('in.csv', 'out.csv', (3, 4))

Related

csv skipping appending data skips rows

I have Python code that appends data to the same CSV file it reads from, but the appended data skips rows: it starts at row 15 instead of row 4.
import csv

# Collect every non-empty entry of the 'Values' column as an integer.
with open('csvtask.csv', 'r') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    ls = [int(line['Values']) for line in csv_reader if line['Values']]

# Three blank placeholders, then the difference between each pair of
# consecutive values.
new_ls = ['', '', ''] + [later - earlier for earlier, later in zip(ls, ls[1:])]
print(new_ls)

# Mode 'a' appends, so every row written here lands after the last line
# already in the file. The `with` block closes the file on exit, so no
# explicit close() is needed.
with open('csvtask.csv', 'a', newline='') as new_file:
    csv_writer = csv.writer(new_file)
    for i in new_ls:
        csv_writer.writerow(('', '', '', '', i))
Here is the image
It's not really feasible to update a file at the same time you're reading it, so a common workaround is to create a new file. The following does that while preserving the fieldnames of the original file. The new column will be named Diff.
Since there's no previous value from which to calculate a difference for the first row, the rows are processed with the built-in enumerate() function, which yields each item together with its index in the sequence. You can use the index to tell whether the current row is the first one and handle it specially.
import csv

# Read the csv file once, keeping each row and calculating its new column.
with open('csvtask.csv', 'r', newline='') as file:
    reader = csv.DictReader(file)
    # Build a new list so the reader's own fieldnames are not modified;
    # `or []` covers an empty file, where fieldnames is None.
    fieldnames = list(reader.fieldnames or []) + ['Diff']  # Name of new field.
    rows = []
    prev_value = 0
    for i, row in enumerate(reader):
        # A blank 'Values' cell counts as 0 in the calculation only; the
        # cell itself is written back unchanged.
        value = int(row['Values']) if row['Values'] else 0
        # The first row has no predecessor, so its difference is left blank.
        row['Diff'] = value - prev_value if i > 0 else ''
        prev_value = value
        rows.append(row)

# Write an updated file with the column added to it.
with open('csvtask_updated.csv', 'w', newline='') as outp:
    writer = csv.DictWriter(outp, fieldnames)
    writer.writeheader()
    writer.writerows(rows)

print('Done')
You can use the DictWriter function like this:-
header = ["data", "values"]
writer = csv.DictWriter(file, fieldnames=header)
# DictWriter.writerows() expects one mapping per row, keyed by field name,
# so pair each list of values with the header before writing.
data = [dict(zip(header, values)) for values in [[1, 2], [4, 6]]]
writer.writerows(data)

Read all columns from CSV file?

I am trying to read in a CSV file and then take all values from each column and put into a separate list. I do not want the values by row. Since the CSV reader only allows to loop through the file once, I am using the seek() method to go back to the beginning and read the next column. Besides using a Dict mapping, is there a better way to do this?
# The `with` block closes the file once all four columns have been read.
with open(fpath, "r", newline="") as infile:
    reader = csv.reader(infile)
    NOUNS = [col[0] for col in reader]
    infile.seek(0)  # <-- set the iterator to beginning of the input file
    VERBS = [col[1] for col in reader]
    infile.seek(0)
    ADJECTIVES = [col[2] for col in reader]
    infile.seek(0)
    SENTENCES = [col[3] for col in reader]
Something like this would do it in one pass:
# `kinds` is a tuple holding the same four list objects that the names
# NOUNS, VERBS, ADJECTIVES and SENTENCES are bound to.
kinds = NOUNS, VERBS, ADJECTIVES, SENTENCES = [], [], [], []
# newline="" hands line-ending handling to the csv module.
with open(fpath, "r", newline="") as infile:
    for cols in csv.reader(infile):
        # Column i of every row goes to the i-th list.
        for i, kind in enumerate(kinds):
            kind.append(cols[i])
You could feed the reader to zip and unpack it to variables as you wish.
import csv

# newline='' hands line-ending handling to the csv module.
with open('input.csv', newline='') as f:
    # zip(*rows) transposes the rows, yielding one tuple per column; the
    # unpacking requires exactly four columns.
    first, second, third, fourth = zip(*csv.reader(f))

print('first: {}, second: {}, third: {}, fourth: {}'.format(
    first, second, third, fourth
))
With following input:
1,2,3,4
A,B,C,D
It will produce output:
first: ('1', 'A'), second: ('2', 'B'), third: ('3', 'C'), fourth: ('4', 'D')
This works assuming you know exactly how many columns are in the csv (and there isn't a header row).
NOUNS = []
VERBS = []
ADJECTIVES = []
SENTENCES = []
# newline="" hands line-ending handling to the csv module.
with open(fpath, "r", newline="") as infile:
    reader = csv.reader(infile)
    for row in reader:
        # Each of the first four cells goes to its own column list.
        NOUNS.append(row[0])
        VERBS.append(row[1])
        ADJECTIVES.append(row[2])
        SENTENCES.append(row[3])
If you don't know the column headers, you're going to have to be clever and read off the first row, make lists for every column you encounter, and loop through every new row and insert in the appropriate list. You'll probably need to do a list of lists.
If you don't mind adding a dependency, use Pandas. Load the file into a DataFrame with the read_csv() function, then access each column by its column name, e.g.
# read_csv is a module-level pandas function that returns a DataFrame;
# it is not a method of DataFrame.
df = pandas.read_csv(fpath)
print(df['NOUN'])
print(df['VERBS'])
I am not sure why you don't want to use a dict mapping. This is what I ended up doing:
Data
col1,col2,col3
val1,val2,val3
val4,val5,val6
Code
import csv

# Maps each column name to the list of values found in that column.
d = {}
with open("abc.text", newline="") as csv_file:
    reader = csv.DictReader(csv_file)
    for row in reader:
        for key, value in row.items():
            # Start a list the first time a column name is seen, then append.
            d.setdefault(key, []).append(value)
print(d)
{'col2': ['val2', 'val5'], 'col3': ['val3', 'val6'], 'col1': ['val1', 'val4']}

Writing intersection data to new CSV

I have 2 CSV files which have a list of unique words. After I complete my intersection on them I get the results, but when I try to write it to a new file it creates a very large sized file of almost 155MB, when it should be well below 2MB.
Code:
alist, blist = [], []
# Flatten every cell of each file into one list per file.
with open("SetA-unique.csv", "r", newline="") as fileA:
    reader = csv.reader(fileA, delimiter=',')
    for row in reader:
        alist += row
with open("SetB-unique.csv", "r", newline="") as fileB:
    reader = csv.reader(fileB, delimiter=',')
    for row in reader:
        blist += row
first_set = set(alist)
second_set = set(blist)
res = first_set.intersection(second_set)
# Opening the output in a `with` block guarantees it is flushed and closed.
with open("SetA-SetB.csv", 'w', newline="") as out_file:
    writer = csv.writer(out_file)
    for row in res:
        # NOTE(review): this writes the whole set `res` once per element,
        # which is the cause of the oversized output file being asked about.
        writer.writerow(res)
You're writing the entire set res to the file on each iteration. You probably want to write the rows instead:
for row in res:
    # Wrap the word in a list so it is written as one cell; writerow()
    # treats a bare string as a sequence of one-character cells.
    writer.writerow([row])
Apart from writing the whole set each iteration you also don't need to create multiple sets and lists, you can use itertools.chain:
from itertools import chain

with open("SetA-unique.csv", newline="") as file_a, \
        open("SetB-unique.csv", newline="") as file_b, \
        open("SetA-SetB.csv", 'w') as inter:
    r1 = csv.reader(file_a)
    r2 = csv.reader(file_b)
    # chain.from_iterable flattens the rows into a stream of words, so only
    # one set is built and the second file is streamed against it.
    for word in set(chain.from_iterable(r1)).intersection(chain.from_iterable(r2)):
        # The newline must be part of the string passed to write().
        inter.write(word + "\n")
If you are just writing words there is also no need to use csv.writer just use file.write as above.
If you are actually trying to do the comparison row-wise, you should not create a flat iterable of words; instead, convert each row to a tuple with map (itertools.imap on Python 2):
# The built-in map is lazy on Python 3, so itertools.imap is not needed.
with open("SetA-unique.csv", newline="") as file_a, \
        open("SetB-unique.csv", newline="") as file_b, \
        open("SetA-SetB.csv", 'w', newline="") as inter:
    r1 = csv.reader(file_a)
    r2 = csv.reader(file_b)
    writer = csv.writer(inter)
    # Rows are lists, which are unhashable; tuples can be stored in a set.
    for row in set(map(tuple, r1)).intersection(map(tuple, r2)):
        writer.writerow(row)
And if you only have one word per line you don't need the csv lib at all.
# The built-in map is lazy on Python 3, so itertools.imap is not needed.
with open("SetA-unique.csv") as file_a, \
        open("SetB-unique.csv") as file_b, \
        open("SetA-SetB.csv", 'w') as inter:
    # str.strip removes the trailing newline (and surrounding whitespace)
    # from each line before comparison.
    for word in set(map(str.strip, file_a)).intersection(map(str.strip, file_b)):
        # The newline must be part of the string passed to write().
        inter.write(word + "\n")

Read and Compare 2 CSV files on a row and column basis

I have two CSV files. data.csv and data2.csv.
First, I would like to strip the two data files down to the data I am interested in. I have figured this part out for data.csv. I would then like to compare the files row by row and, if a row is missing, add it.
Next I want to look at column 2. If there is a value there, I want to write to column 3; if there is data in column 3, then write to column 4, etc.
My current program looks like this. I need some guidance.
Oh, and I am using Python 3.4.
__author__ = 'krisarmstrong'
#!/usr/bin/python
import csv
searched = ['aircheck', 'linkrunner at', 'onetouch at']


def find_group(row):
    """Return the group index of a row.

    Columns are scanned left to right; each is lowercased and tested for
    every entry of ``searched`` in order. The index of the first entry found
    as a substring is returned:

        0 if the row contains searched[0]
        1 if the row contains searched[1]
        etc
        -1 if not found
    """
    for col in row:
        lowered = col.lower()
        for j, s in enumerate(searched):
            if s in lowered:
                return j
    return -1
# One `with` statement closes all three files, including data2.csv.
with open('data.csv', newline='') as inFile, \
        open('data2.csv', newline='') as inFile2, \
        open('data3.csv', "w", newline='') as outFile:
    reader = csv.reader(inFile)
    reader2 = csv.reader(inFile2)
    writer = csv.writer(outFile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
    header = next(reader)
    header2 = next(reader2)  # Read but not used below.

    # Build a list of items to sort. If row 12 contains 'LinkRunner AT'
    # (group 1), one stores a triple (1, 12, row). When the triples are
    # sorted later, all rows in group 0 will come first, then all rows in
    # group 1, etc.
    stored = []
    writer.writerow([header[0], header[3]])
    for i, row in enumerate(reader):
        g = find_group(row)
        if g >= 0:
            stored.append((g, i, row))
    stored.sort()
    # Only the first and fourth columns are written, matching the header.
    for g, i, row in stored:
        writer.writerow([row[0], row[3]])
Perhaps try:
import csv

# The first two columns of every row, collected separately.
col1, col2 = [], []
# On Python 3 the csv module needs a text-mode file opened with newline=''.
with open('some.csv', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        col1.append(row[0])
        col2.append(row[1])

for first, second in zip(col1, col2):
    if first == '':
        pass  # thing to do if there is nothing for col1
    if second == '':
        pass  # thing to do if there is nothing for col2
This is a start at "making sure that if a row is missing to add it".

Update CSV file by column name using Python

I have the csv file as follows:
product_name, product_id, category_id
book, , 3
shoe, 3, 1
lemon, 2, 4
I would like to update product_id of each row by providing the column name using python's csv library.
So for an example if I pass:
update_data = {"product_id": [1,2,3]}
then the csv file should be:
product_name, product_id, category_id
book, 1, 3
shoe, 2, 1
lemon, 3, 4
You can use your existing dict and iter to take items in order, eg:
import csv

update_data = {"product_id": [1, 2, 3]}
# Convert the values of your dict to be directly iterable so we can `next` them
to_update = {k: iter(v) for k, v in update_data.items()}

# On Python 3 the csv module needs text-mode files opened with newline='';
# the binary modes 'rb'/'wb' only work on Python 2.
with open('input.csv', newline='') as fin, \
        open('output.csv', 'w', newline='') as fout:
    # create in/out csv readers, skip initial space so it matches the update dict
    # and write the header out
    csvin = csv.DictReader(fin, skipinitialspace=True)
    csvout = csv.DictWriter(fout, csvin.fieldnames)
    csvout.writeheader()
    for row in csvin:
        # Update rows - if we have something left and it's in the update dictionary,
        # use that value, otherwise we use the value that's already in the column.
        row.update({k: next(to_update[k], row[k]) for k in row if k in to_update})
        csvout.writerow(row)
Now - this assumes that each new column value goes to the row number and that the existing values should be used after that. You could change that logic to only use new values when the existing value is blank for instance (or whatever other criteria you wish).
(assuming you're using 3.x)
Python has a CSV module in the standard library which helps read and amend CSV files.
Using that I'd find the index for the column you are after and store it in the dictionary you've made. Once that has been found it's simply a matter of popping the list item into each row.
import csv

# I've nested the original list inside another so that we can hold the
# column index in the first position.
update_data = {"product_id": [None, [1, 2, 3]]}
# Holds the new rows for when we rewrite the file.
new_csv = []

with open('test.csv', 'r', newline='') as csvfile:
    # skipinitialspace strips the blank after each comma, so a header cell
    # written as ", product_id" matches the key "product_id".
    filereader = csv.reader(csvfile, skipinitialspace=True)
    for line_no, line in enumerate(filereader):
        if line_no == 0:
            # Header row: find each column's index and store it for later.
            for key in update_data:
                update_data[key][0] = line.index(key)
        else:
            for index, values in update_data.values():
                # Using the column index, enter the new data into the
                # correct place whilst removing it from the input list.
                # Once the values run out, later rows are left as they are.
                if values:
                    line[index] = values.pop(0)
        new_csv.append(line)

with open('test.csv', 'w', newline='') as csvfile:
    filewriter = csv.writer(csvfile)
    filewriter.writerows(new_csv)

Categories