Merge two rows in a csv file in Python

I am very new to Python and I am trying to do a very simple merge of every two lines in a csv file. Basically I want it like this:
Input:
[H1],1,2,3,4
[H2],5,6,7,8
[H1],a,b,c,d
[H2],e,f,g,h
Output:
[H1],1,2,3,4,[H2],5,6,7,8
[H1],a,b,c,d,[H2],e,f,g,h
This is a brief example; the real csv file has up to 167 columns once the two lines are combined. This is what I have:
import csv
f = open("sample.csv",'rU').read().split("\n")
reader = csv.reader(f)
for row in reader:
    if row[0].startswith("[H1]"):
        i=[]
        while i<167: n = row.append([",".join(row[i]+row[i+1])])
        print n
However when I run it I get the following error:
print n
NameError: name 'n' is not defined
Any help is appreciated, thanks.

Input i.csv:
1,2,3
foo,bar,baz
4,5,6
qux,quux,quuux
Python code:
import csv

with open("i.csv") as f:
    reader = csv.reader(f)
    i = 0
    for row in reader:
        if i % 2 == 0:
            newRow = row
        else:
            newRow = newRow + row
            print(newRow)
        i = i + 1
Output:
['1', '2', '3', 'foo', 'bar', 'baz']
['4', '5', '6', 'qux', 'quux', 'quuux']

The same approach applied to your sample.csv:
import csv
f = open("sample.csv",'rU').read().split("\n")
reader = csv.reader(f)
i = 0
for row in reader:
    if i % 2 == 0:
        line = row
    else:
        line = line + row
        print ", ".join(line)
    i += 1

Writing while i<167: n = row.append([",".join(row[i]+row[i+1])]) on a single line is the same as writing:
while i<167:
    n = row.append([",".join(row[i]+row[i+1])])
So n is only ever assigned inside the loop body. In your run that assignment never happens before print n executes (a while loop does not create a new scope in Python; the name simply never gets bound), which raises the NameError.
You could add n = None right before the while:
n = None
while i<167: n = row.append([",".join(row[i]+row[i+1])])
print n
Or move print n into the loop body:
while i<167:
    n = row.append([",".join(row[i]+row[i+1])])
    print n
Either change stops the program from crashing with the NameError, but you will print a lot of lines containing None, because list.append always returns None: https://docs.python.org/2/tutorial/datastructures.html
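A quick illustration of that last point:
row = [1, 2]
n = row.append(3)   # append modifies row in place and returns None
print(n)            # prints: None
print(row)          # prints: [1, 2, 3]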

Here is a way to combine the line pairs:
import csv
from itertools import izip

def main():
    with open('sample.csv', 'rb') as input_file:
        reader = csv.reader(input_file)
        for even_row, odd_row in izip(reader, reader):
            combined_row = even_row + odd_row
            print combined_row

if __name__ == '__main__':
    main()
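For reference, a minimal Python 3 sketch of the same pairing trick (izip no longer exists in Python 3; the built-in zip is lazy). The output filename here is just an example:
import csv

def main():
    # zip(reader, reader) pulls two rows from the same reader per iteration,
    # pairing row 1 with row 2, row 3 with row 4, and so on.
    with open('sample.csv', newline='') as input_file, \
         open('combined.csv', 'w', newline='') as output_file:
        reader = csv.reader(input_file)
        writer = csv.writer(output_file)
        for even_row, odd_row in zip(reader, reader):
            writer.writerow(even_row + odd_row)

if __name__ == '__main__':
    main()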

Related

How to transform a csv file into a multi-dimensional list using Python?

I started out with a 4d list, something like
tokens = [[[["a"], ["b"], ["c"]], [["d"]]], [[["e"], ["f"], ["g"]],[["h"], ["i"], ["j"], ["k"], ["l"]]]]
So I converted this to a csv file using the code
import csv
def export_to_csv(tokens):
    csv_list = [["A", "B", "C", word]]
    for h_index, h in enumerate(tokens):
        for i_index, i in enumerate(h):
            for j_index, j in enumerate(i):
                csv_list.append([h_index, i_index, j_index, j])
    with open('TEST.csv', 'w') as f:
        # using csv.writer method from CSV package
        write = csv.writer(f)
        write.writerows(csv_list)
But now I want to do the reverse process, want to convert a csv file obtained in this format, back to the list format mentioned above.
Assuming you wanted your csv file to look something like this (there were a couple typos in the posted code):
A,B,C,word
0,0,0,a
0,0,1,b
0,0,2,c
...
here's one solution:
import csv

def import_from_csv(filename):
    retval = []
    with open(filename) as fh:
        reader = csv.reader(fh)
        # discard header row
        next(reader)
        # process data rows
        for (x, y, z, word) in reader:
            x = int(x)
            y = int(y)
            z = int(z)
            retval.extend([[[]]] * (x + 1 - len(retval)))
            retval[x].extend([[]] * (y + 1 - len(retval[x])))
            retval[x][y].extend([0] * (z + 1 - len(retval[x][y])))
            retval[x][y][z] = [word]
    return retval
def import_from_csv(file):
    import ast
    import csv
    data = []
    # Read the CSV file
    with open(file) as fp:
        reader = csv.reader(fp)
        # Skip the first line, which contains the headers
        next(reader)
        for line in reader:
            # Read the first 3 elements of the line
            a, b, c = [int(i) for i in line[:3]]
            # When we read it back, everything comes in as strings. Use
            # `literal_eval` to convert it to a Python list
            value = ast.literal_eval(line[3])
            # Extend the list to accommodate the new element
            data.append([[[]]]) if len(data) < a + 1 else None
            data[a].append([[]]) if len(data[a]) < b + 1 else None
            data[a][b].append([]) if len(data[a][b]) < c + 1 else None
            data[a][b][c] = value
    return data

# Test
assert import_from_csv("TEST.csv") == tokens
First, I'd make writing this structure to a CSV file independent of the number of dimensions:
import csv

def deep_iter(seq):
    for i, val in enumerate(seq):
        if type(val) is list:
            for others in deep_iter(val):
                yield i, *others
        else:
            yield i, val

with open('TEST.csv', 'w') as f:
    csv.writer(f).writerows(deep_iter(tokens))
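For the tokens example from the question, this writes one row per word, with four index columns followed by the word, along the lines of:
0,0,0,0,a
0,0,1,0,b
0,0,2,0,c
0,1,0,0,d
...
1,1,4,0,l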
Next, we can use the lexicographic order of the indices to recreate the structure. All we have to do is move step by step deeper into the output list according to the indices of each word. We stop at the penultimate index to get the innermost list: the last index only marks the word's position within that list, and the natural ordering of the rows takes care of it:
with open('TEST.csv', 'r') as f:
    rows = [*csv.reader(f)]

res = []
for r in rows:
    index = r[:-2]  # skip the last index and the word
    e = res
    while index:
        i = int(index.pop(0))  # get the next part of the current index
        if i < len(e):
            e = e[i]
        else:
            e.append([])  # add a new record at this level
            e = e[-1]
    e.append(r[-1])  # append the word to the corresponding list
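A quick sanity check, assuming the original tokens list is still in scope:
assert res == tokens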

Iterating through dictionary, 5 rows at a time

I am trying to open a csv file with csv.DictReader, read in just the first 5 rows of data, perform the primary process of my script, then read in the next 5 rows and do the same for them. Rinse and repeat.
I believe I have a method that works; however, I am having issues with the last lines of the data not being processed. I know I need to modify my if statement so that it also checks whether I'm at the end of the file, but I'm having trouble finding a way to do that. I've found methods online, but they involve reading in the whole file to get a row count, and doing so would defeat the purpose of this script since I'm dealing with memory issues.
Here is what I have so far:
import csv

count = 0
data = []
with open('test.csv') as file:
    reader = csv.DictReader(file)
    for row in reader:
        count += 1
        data.append(row)
        if count % 5 == 0 or #something to check for the end of the file:
            #do stuff
            data = []
Thank you for the help!
You can use the chunksize argument when reading in the csv with pandas. This reads the file a given number of lines at a time:
import pandas as pd

reader = pd.read_csv('test.csv', chunksize=5)
for df in reader:
    # do stuff
You can handle the remaining lines after the for loop body. You can also use the more pythonic enumerate.
import csv

data = []
with open('test.csv') as file:
    reader = csv.DictReader(file)
    for count, row in enumerate(reader, 1):
        data.append(row)
        if count % 5 == 0:
            # do stuff
            data = []

print('handling remaining lines at end of file')
print(data)
considering the file
a,b
1,1
2,2
3,3
4,4
5,5
6,6
7,7
outputs
handling remaining lines at end of file
[OrderedDict([('a', '6'), ('b', '6')]), OrderedDict([('a', '7'), ('b', '7')])]
This is one approach using the iterator
Ex:
import csv

with open('test.csv') as file:
    reader = csv.DictReader(file)
    value = True
    while value:
        data = []
        for _ in range(5):  # Get 5 rows
            value = next(reader, False)
            if value:
                data.append(value)
        print(data)  # List of 5 elements
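Not part of the answer above, but a common variant of the same idea uses itertools.islice to pull up to five rows at a time without tracking a flag:
import csv
from itertools import islice

with open('test.csv') as file:
    reader = csv.DictReader(file)
    while True:
        chunk = list(islice(reader, 5))  # take up to 5 rows from the reader
        if not chunk:
            break
        # do stuff with chunk (a list of 1 to 5 rows)
        print(chunk)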
Staying along the lines of what you wrote and not including any other imports:
import csv

data = []
with open('test.csv') as file:
    reader = csv.DictReader(file)
    for row in reader:
        data.append(row)
        if len(data) > 5:
            del data[0]
        if len(data) == 5:
            # Do something with the 5 elements
            print(data)
The if statements allow the list to fill with 5 elements before processing begins; after that, the oldest row is dropped each time, so the loop always works on the 5 most recent rows.
class ZeroItterNumberException(Exception):
    pass

class ItterN:
    def __init__(self, itterator, n):
        if n < 1:
            raise ZeroItterNumberException("{} is not a valid number of rows.".format(n))
        self.itterator = itterator
        self.n = n
        self.cache = []

    def __iter__(self):
        return self

    def __next__(self):
        self.cache.append(next(self.itterator))
        if len(self.cache) < self.n:
            return self.__next__()
        if len(self.cache) > self.n:
            del self.cache[0]
        if len(self.cache) == self.n:  # the cache now holds exactly n rows
            return self.cache
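A minimal usage sketch (not part of the original answer), assuming the class above and a test.csv with a header row:
import csv

with open('test.csv') as file:
    reader = csv.DictReader(file)
    for window in ItterN(reader, 5):
        # window always holds the 5 most recent rows (a sliding window)
        print(window)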

About CS50 Pset6 DNA, it overcounts STRs for large.csv

I'm working on pset6, the DNA problem. This code works for small.csv, but when I try the large one it overestimates the STR count. I guess the problem is when it tries to compare strings, but I still don't know how to fix it. I checked that the counting is correct for the "TTTTTTCT" sequence, but for the remaining STRs the count is in every case larger than it should be.
import sys
import csv

def main():
    while (len(sys.argv) != 3):
        print ("ERROR. Usage: python dna.py data.csv sequence.txt")
        break

    list_str = {}
    #load the STRs to analyse
    with open(sys.argv[1]) as csvfile:
        readcsv = csv.reader (csvfile)
        ncol = len(next(readcsv))
        csvfile.seek(0)
        header = list()
        for line in readcsv:
            a = sum(1 for line in readcsv)
            for i in range(ncol):
                list_str[line[i]] = 0
                header.insert (i, line [i])
                print (f"{header[i]}")

    #open and work with the sequence file
    sequence = open(sys.argv[2], 'r')
    seq_r = sequence.read()

    for k in list_str.keys():
        #print (f"key {k}")
        p = 0
        seq = len(seq_r)
        while p < seq:
            if seq_r[p:(p + len(k))] == k:
                list_str[k] += 1
                p += len(k)
            else: p += 1
        #print (f"sequence found {list_str[k]} and {k}")
        print (f"count of {k}: {list_str[k]}")

    with open(sys.argv[1]) as csvfile:
        readcsv = csv.reader (csvfile)
        next(csvfile)
        find = False
        for row in readcsv:
            for j in range(1,ncol):
                #print(f"header :{header[j]}")
                if int(row [j]) == int(list_str[header[j]]):
                    print (f"row {row[j]} list {list_str[header[j]]}")
                    find = True
                else:
                    find = False
                    break
            if find == True: print (f"{row [0]}")

main()
The same thing happened to me, and then I saw the specification of the pset.
We need to find the "longest run of consecutive repeats of the STR", not the total count of STRs. It works for small.csv in my case too, so try searching for the longest consecutive run of each specific STR instead.
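As an illustration of what that means (a minimal sketch, not the full pset solution), counting the longest run of consecutive repeats of one STR in a sequence might look like this:
def longest_run(sequence, str_pattern):
    # Longest number of back-to-back repeats of str_pattern anywhere in sequence.
    longest = 0
    n = len(str_pattern)
    for start in range(len(sequence)):
        count = 0
        pos = start
        while sequence[pos:pos + n] == str_pattern:
            count += 1
            pos += n
        longest = max(longest, count)
    return longest

# "AGAT" occurs three times in a row here, so this prints 3
print(longest_run("GGAGATAGATAGATTT", "AGAT"))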

Find and write to next blank cell in a column

I need to find and write to the next blank cell.
import csv

with open(r'C:\\filepath\file.txt', 'r') as input_file:
    reader = csv.reader(input_file)
    with open (r'C:\filepath\file.csv', 'a', newline = '') as output_file:
        writer = csv.writer(output_file)
        for row in reader:
            content = [i.split('~') for i in row]
            for row1 in content:
                con = [len(k.split('*')) for k in row1]
                conn = [m.split('*') for m in row1]
                for b in conn:
                    if con[0] > 4:
                        if (b[0] == 'NM1' and b[1] == '82' and b[2] == '1' ):
                            writer.writerow([b[3]] + [b[4]])
                            print ( b[3] + b[4] )
                        elif (b[0] == 'GS' ):
                            writer.writerow(['','','',b[2]])
                            print(b[2])
I am trying to get the output shown in the pic above. Right now only 'App1' is printed in the first row, then the names in the second row, and so on. The input file I am using is below:
ISA*16* 00 0*T*>~
GS*IN*APP1*0999~
HPT*1*2~ SE*21*0001~
GE*1*145~
NM1*82*1*Tiger1a*Test1*K****~
NM1*82*1*Lion1a*Test2*K****~
NM1*82*1*Elephant1a*Test3*K****~
ISA*16* 00 0*T*>~
GS*IN*APP2*0999~
HPT*1*2~ SE*21*0001~
GE*1*145~
NM1*82*1*Tiger1a*Test4*K****~
ISA*16* 00 0*T*>~
GS*IN*APP1*0999~
HPT*1*2~
SE*21*0001~
GE*1*145~
NM1*82*1*Tiger1a*Test4*K****~
NM1*82*1*Lion1a*Test5*K****~
NM1*82*1*Elephant1a*Test6*K****~
ISA*16* 00 0*T*>~
GS*IN*APP10999~
HPT*1*2~
SE*21*0001~
GE*1*145~
NM1*82*1*Tiger1a*Test7*K****~
OK, I assume that you have an input file where '~' is a record separator and '*' is a field separator. As the csv module only deals with lines, I would first use a generator to split the input file on '~'.
Then I would build two lists: one with the records starting with NM1*82*1, keeping the two following fields, and one with the records starting with GS, keeping a single field.
Finally I would add each element of the second list to the corresponding line in the first one.
Code could be:
import csv

def splitter(fd, sep):
    """Splits fd (assumed to be an input file object) on sep ignoring end of lines"""
    last = ""
    for line in fd:
        lines = line.strip().split(sep)
        lines[0] = last + lines[0]
        last = lines.pop()
        for l in lines:
            yield(l.strip())
    if last != "":
        yield last.strip()
    return

with open(r'C:\\filepath\file.txt', 'r') as input_file, \
     open (r'C:\filepath\file.csv', 'a', newline = '') as output_file:
    rd = csv.reader(splitter(input_file, '~'), delimiter='*')
    wr = csv.writer(output_file)
    ls1 = []
    ls2 = []
    for b in rd:
        if b[0] == 'NM1' and b[1] == '82' and b[2] == '1':
            ls1.append([b[3], b[4]])
        elif b[0] == 'GS':
            ls2.append(b[2])
    for i, b in enumerate(ls2):
        ls1[i].append(b)
    wr.writerows(ls1)
I obtain:
Tiger1a,Test1,APP1
Lion1a,Test2,APP2
Elephant1a,Test3,APP1
Tiger1a,Test4,APP10999
Tiger1a,Test4
Lion1a,Test5
Elephant1a,Test6
Tiger1a,Test7
Try reading the files into separate dictionaries with line numbers as keys. You can then iterate through both dictionaries at the same time using the zip function.
def zip(*iterables):
    # zip('ABCD', 'xy') --> Ax By
    sentinel = object()
    iterators = [iter(it) for it in iterables]
    while iterators:
        result = []
        for it in iterators:
            elem = next(it, sentinel)
            if elem is sentinel:
                return
            result.append(elem)
        yield tuple(result)
More info here: Python3 zip function
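A minimal sketch of that idea, with illustrative dictionary contents taken from the expected output in the previous answer (the variable names are just examples):
# Two dictionaries keyed by line number (illustrative data only)
names = {0: ['Tiger1a', 'Test1'], 1: ['Lion1a', 'Test2'], 2: ['Elephant1a', 'Test3']}
apps = {0: 'APP1', 1: 'APP2', 2: 'APP1'}

for (_, name), (_, app) in zip(names.items(), apps.items()):
    print(name + [app])  # e.g. ['Tiger1a', 'Test1', 'APP1']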

Loop with numbers in rows of the csv file

I wrote a Python script to run a SQL query in PostgreSQL:
import sys, os, math
os.chdir(r'C:\Users\Heinz\Desktop')
print os.getcwd()
#set up psycopg2 environment
import psycopg2
#shortest_path module
query = """
    select *
    from shortest_path ($$
        select
            gid as id,
            source::int4 as source,
            target::int4 as target,
            cost::double precision as cost,
            rcost::double precision as reverse_cost
        from network
        $$, %s, %s, %s, %s
    )
"""
#make connection between python and postgresql
conn = psycopg2.connect("dbname = 'test' user = 'postgres' host = 'localhost' password = 'xxxx'")
cur = conn.cursor()
#count rows in the table
cur.execute("select count(*) from network")
result = cur.fetchone()
k = result[0] + 1 #number of points = number of segments + 1
#run loops
#import csv module
import csv
import tempfile
element = []
i = 1
l = 1
filename = 'pi_value.csv'
with open(filename, 'wb') as f:
    while i <= k:
        while l <= k:
            cur.execute(query, (i, l, True, True))
            element = cur.fetchall()
            product = sum([a[-1] for a in element[:-1]])
            writer = csv.writer(f, delimiter = ',')
            writer.writerow([product])
            element = []
            l = l + 1
        l = 1
        i = i + 1
You can see that I used counters i and l (each running up to k) to drive the while loops. Now I have a csv file containing the numbers I want i and l to take. For example, given that csv file,
I want the iterators to loop through the numbers in each row, starting from the first one: in the innermost while loop l = 6, l = 31, l = 28, ..., l = 17; i starts from 6 too, but only moves on to i = 31 once l has reached 17 and gone back to l = 6, and so on.
How can I write additional lines to read this csv file and make the iterators in the while loops run over the numbers in the file?
Update#1
I tried this,
element = []
with open('tc_sta_id.csv') as f1, open('pi_value.csv', 'wb') as f2:
    csvs = csv.reader(f1)
    col_num = 0
    rows = list(csvs)
    k = len(rows)
    for row in csvs:
        i = row[col_num]
        l = row[col_num]
        while i <= k:
            while l <= k:
                cur.execute(query, (i, l, True, True))
                element = cur.fetchall()
                product = sum([a[-1] for a in element[:-1]])
                writer = csv.writer(f2, delimiter = ',')
                writer.writerow([product])
                element = []
                l = l + 1
            l = row[col_num]
            i = i + 1
The script runs without errors, but the output csv file is completely blank. Please give me suggestions to fix this!
Since your question has changed quite a bit since the start, I'm adding this as a separate answer. This answer is specifically for your update #1.
The condition for your while loop is wrong. Your condition is based on the number of rows in your csv (8 in your example), and you compare it with the numbers found in the csv (so 6, 31, ...). This means your while loops stop every time they hit the second number (31 > 8). Moreover, you're not jumping to the next element of your csv; you just add 1. I haven't tried to run your code, but I think you're looping over i = 6, 7, 8 with l = 6, 7, 8 for each value of i. Then it tries 31 and stops immediately, as it does with the rest (they're all over 8).
I'm not entirely sure what you want, as you seem to keep wanting to use extra while loops for something, and I can't tell from the question what they're for (everything in your question implies for loops only).
I'm also not sure whether i and l come from the same csv or not. I made you a solution where you can easily make i and l come from different csvs, but I set them at the beginning to come from the same one. If they come from the same csv, you cannot just nest the for loops over the same iterator, so we cheat and extract the rows into a list first (I tested this with a simple example).
rows = list(csvs)  # convert to a list to avoid problems with iterating over the same iterator
csv_for_i = rows
csv_for_l = rows
for row_i in csv_for_i:
    i = row_i[col_num]
    for row_l in csv_for_l:
        l = row_l[col_num]
        cur.execute(query, (i, l, True, True))
        element = cur.fetchall()
        product = sum([a[-1] for a in element[:-1]])
        writer = csv.writer(f2, delimiter = ',')
        writer.writerow([product])
        element = []
Let me know if this works. If so, accept the answer and I'll think about how to make the question and the answers into something that works more nicely on stack overflow. Currently, there are actually multiple questions and answers here and that's confusing for other people searching for answers.
Just for info, a small example on pitfalls with iterators (made with csv, but it goes for all iterators).
import csv

# test.csv contents:
#
# 6
# 31
# 17

print 'Case 1:'
with open('test.csv') as f1:
    csv1 = csv.reader(f1)
    csv2 = csv.reader(f1)
    for el1 in csv1:
        for el2 in csv2:
            print el1, el2

# Results
#
# ['6'] ['31']
# ['6'] ['17']

print 'Case 2:'
with open('test.csv') as f1:
    csvs = csv.reader(f1)
    rows = list(csvs)
    for el1 in rows:
        for el2 in rows:
            print el1, el2

# Results
#
# ['6'] ['6']
# ['6'] ['31']
# ['6'] ['17']
# ['31'] ['6']
# ['31'] ['31']
# ['31'] ['17']
# ['17'] ['6']
# ['17'] ['31']
# ['17'] ['17']

print 'Case 3:'
with open('test.csv') as f1, open('test.csv') as f2:
    for el1 in csv.reader(f1):
        for el2 in csv.reader(f2):
            print el1, el2

# Results
#
# ['6'] ['6']
# ['6'] ['31']
# ['6'] ['17']

print 'Case 4:'
with open('test.csv') as f1, open('test.csv') as f2:
    csv1 = csv.reader(f1)
    csv2 = csv.reader(f2)
    for el1 in csv1:
        for el2 in csv2:
            print el1, el2

# Results
#
# ['6'] ['6']
# ['6'] ['31']
# ['6'] ['17']
col_num is the column number in which you have your i values
rs = []
with open('yourfile') as file:
    reader = csv.reader(file)
    next(reader)  # skip the header
    col_num = 0
    for row in reader:
        i = int(row[col_num])  # cast to int so the comparison with k works
        while i <= k:
            cur.execute(query, (i, 100000000000, True, True))
            rs.append(cur.fetchall())
            i = i + 1
I made you a short test using just simple python functionality.
f = open('test.csv')
csvlines = f.readlines()
f.close()
numbers = [int(n.split(',')[0]) for n in csvlines]
You might have to replace ',' with ';' or something else depending on the locale settings of your operating system.
Short explanation:
csvlines will contain the rows of your csv as strings, e.g. ['1,a,some text', '2,b,some other text']. You go through each of those lines and call split on it: '1,a,some text'.split(',') gives ['1', 'a', 'some text']. The first column then needs to be cast to an integer because at that point it is still a string.
Use in your code as (edited as question was edited):
for i in numbers:
    if i < k:
        for l in numbers:
            # not sure what your constraint on k is, but you can stop iterating
            # through the numbers with a simple if
            if l < k:
                # do work (you can use i and l here, they will automatically
                # take the next value on each iteration of the for loop;
                # try print i, l for example): 6,6; 6,31; ...; 6,17; 31,6; 31,31
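Putting the pieces together, a minimal self-contained sketch of the nested iteration (no database; the filename and the k cutoff are illustrative, and the csv is assumed to hold one number per row with no header):
import csv

k = 100  # illustrative cutoff; in the original script k comes from the row count of the network table
with open('tc_sta_id.csv') as f:
    numbers = [int(row[0]) for row in csv.reader(f)]

for i in numbers:
    if i < k:
        for l in numbers:
            if l < k:
                # this is where the query would be executed with (i, l)
                print(i, l)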
