Reformatting Excel spreadsheet in Python - python

Hi everyone, I received data in a excel (xls) spreadsheet that is formatted in the first table, illustrated above.
I am attempting to rearrange this data into the format, in the table, just below. Any help would be greatly appreciated.
Thanks much.

First, save it to a .csv file
import csv
curr = []
with open('file.csv') as infile, open('path/to/output', 'w') as fout:
outfile = csv.writer(fout)
for area, pop10, pop20, pop50 in csv.reader(infile):
if curr and curr[0] != area:
outfile.writerow(curr)
curr = [area, pop10, pop20, pop50]
continue
if pop10: curr[1] = pop10
if pop20: curr[2] = pop20
if pop50: curr[3] = pop50

You can do this pretty succinctly using Pandas:
import pandas as pd
dataframe = pd.read_excel("in.xlsx")
merged = dataframe.groupby("AREA").sum()
merged.to_excel("out.xlsx")

so, if the csv has 11 columns where 'AREA' is the second column, would the code be:
def CompressRow(in_csv,out_file):
curr = []
with open(in_csv) as infile, open(out_file, 'w') as fout:
outfile = csv.writer(fout)
for a,b,c,d,e,f,g,h,i,j,k in csv.reader(infile):
if curr and curr[1] != b:
outfile.writerow(curr)
curr = [a,b,c,d,e,f,g,h,i,j,k]
continue
if a: curr[0] = a
if c: curr[2] = c
if d: curr[3] = d
if e: curr[4] = e
if f: curr[5] =f
if g: curr[6]=g
if h: curr[7]=h
if i: curr[8]=i
if j: curr[9]=j
if k: curr[10]=k
#execute CompressRow(in_csv,out_file)
I tried executing it and it gives me
if a: curr[0]=a
IndexError: list assignment index out of range

Related

How to transform a csv file into a multi-dimensional list using Python?

I started out with a 4d list, something like
tokens = [[[["a"], ["b"], ["c"]], [["d"]]], [[["e"], ["f"], ["g"]],[["h"], ["i"], ["j"], ["k"], ["l"]]]]
So I converted this to a csv file using the code
import csv
def export_to_csv(tokens):
csv_list = [["A", "B", "C", word]]
for h_index, h in enumerate(tokens):
for i_index, i in enumerate(h):
for j_index, j in enumerate(i):
csv_list.append([h_index, i_index, j_index, j])
with open('TEST.csv', 'w') as f:
# using csv.writer method from CSV package
write = csv.writer(f)
write.writerows(csv_list)
But now I want to do the reverse process, want to convert a csv file obtained in this format, back to the list format mentioned above.
Assuming you wanted your csv file to look something like this (there were a couple typos in the posted code):
A,B,C,word
0,0,0,a
0,0,1,b
0,0,2,c
...
here's one solution:
import csv
def import_from_csv(filename):
retval = []
with open(filename) as fh:
reader = csv.reader(fh)
# discard header row
next(reader)
# process data rows
for (x,y,z,word) in reader:
x = int(x)
y = int(y)
z = int(z)
retval.extend([[[]]] * (x + 1 - len(retval)))
retval[x].extend([[]] * (y + 1 - len(retval[x])))
retval[x][y].extend([0] * (z + 1 - len(retval[x][y])))
retval[x][y][z] = [word]
return retval
def import_from_csv(file):
import ast
import csv
data = []
# Read the CSV file
with open(file) as fp:
reader = csv.reader(fp)
# Skip the first line, which contains the headers
next(reader)
for line in reader:
# Read the first 3 elements of the line
a, b, c = [int(i) for i in line[:3]]
# When we read it back, everything comes in as strings. Use
# `literal_eval` to convert it to a Python list
value = ast.literal_eval(line[3])
# Extend the list to accomodate the new element
data.append([[[]]]) if len(data) < a + 1 else None
data[a].append([[]]) if len(data[a]) < b + 1 else None
data[a][b].append([]) if len(data[a][b]) < c + 1 else None
data[a][b][c] = value
return data
# Test
assert import_from_csv("TEST.csv") == tokens
First, I'd make writing this construction in a CSV format independent from dimensions:
import csv
def deep_iter(seq):
for i, val in enumerate(seq):
if type(val) is list:
for others in deep_iter(val):
yield i, *others
else:
yield i, val
with open('TEST.csv', 'w') as f:
csv.writer(f).writerows(deep_iter(tokens))
Next, we can use the lexicographic order of the indices to recreate the structure. All we have to do is sequentially move deeper into the output list according to the indices of a word. We stop at the penultimate index to get the last list, because the last index is pointing only at the place of the word in this list and doesn't matter due to the natural ordering:
with open('TEST.csv', 'r') as f:
rows = [*csv.reader(f)]
res = []
for r in rows:
index = r[:-2] # skip the last index and word
e = res
while index:
i = int(index.pop(0)) # get next part of a current index
if i < len(e):
e = e[i]
else:
e.append([]) # add new record at this level
e = e[-1]
e.append(r[-1]) # append the word to the corresponding list

Iterating through dictionary, 5 rows at a time

I am trying to open a csv file with csv.DictReader, read in just the first 5 rows of data, perform the primary process of my script, then read in the next 5 rows and do the same for them. Rinse and repeat.
I believe I have a method that works, however I am having issues with the last lines of the data not processing. I know I need to modify my if statement so that it also checks for if I'm at the end of the file, but am having trouble finding a way to do that. I've found methods online, but they involve reading in the whole file to get a row count but doing so would defeat the purpose of this script as I'm dealing with memory issues.
Here is what I have so far:
import csv
count = 0
data = []
with open('test.csv') as file:
reader = csv.DictReader(file)
for row in reader:
count +=1
data.append(row)
if count % 5 == 0 or #something to check for the end of the file:
#do stuff
data = []
Thank you for the help!
You can use the chunksize argument when reading in the csv. This will step by step read in the number of lines:
import pandas as pd
reader = pd.read_csv('test.csv', chunksize=5)
for df in reader:
# do stuff
You can handle the remaining lines after the for loop body. You can also use the more pythonic enumerate.
import csv
data = []
with open('test.csv') as file:
reader = csv.DictReader(file)
for count, row in enumerate(reader, 1):
data.append(row)
if count % 5 == 0:
# do stuff
data = []
print('handling remaining lines at end of file')
print(data)
considering the file
a,b
1,1
2,2
3,3
4,4
5,5
6,6
7,7
outputs
handling remaining lines at end of file
[OrderedDict([('a', '6'), ('b', '6')]), OrderedDict([('a', '7'), ('b', '7')])]
This is one approach using the iterator
Ex:
import csv
with open('test.csv') as file:
reader = csv.DictReader(file)
value = True
while value:
data = []
for _ in range(5): # Get 5 rows
value = next(reader, False)
if value:
data.append(value)
print(data) #List of 5 elements
Staying along the lines of what you wrote and not including any other imports:
import csv
data = []
with open('test.csv') as file:
reader = csv.DictReader(file)
for row in reader:
data.append(row)
if len(data) > 5:
del data[0]
if len(data) == 5:
# Do something with the 5 elements
print(data)
The if statements allow the array to be loaded with 5 elements before processing on the begins.
class ZeroItterNumberException(Exception):
pass
class ItterN:
def __init__(self, itterator, n):
if n<1:
raise ZeroItterNumberException("{} is not a valid number of rows.".format(n))
self.itterator = itterator
self.n = n
self.cache = []
def __iter__(self):
return self
def __next__(self):
self.cache.append(next(self.itterator))
if len(self.cache) < self.n:
return self.__next__()
if len(self.cache) > self.n:
del self.cache[0]
if len(self.cache) == 5:
return self.cache

Find and write to next blank cell in a column

I need to find and write to next blank cell.
import csv
with open(r'C:\\filepath\file.txt', 'r') as input_file:
reader = csv.reader(input_file)
with open (r'C:\filepath\file.csv', 'a', newline = '') as output_file:
writer = csv.writer(output_file)
for row in reader:
content = [i.split('~') for i in row]
for row1 in content:
con = [len(k.split('*')) for k in row1]
conn = [m.split('*') for m in row1]
for b in conn:
if con[0] > 4:
if (b[0] == 'NM1' and b[1] == '82' and b[2] == '1' ):
writer.writerow([b[3]] + [b[4]])
print ( b[3] + b[4] )
elif (b[0] == 'GS' ):
writer.writerow(['','','',b[2]])
print(b[2])
Seeking to get the output as shown in pic above. Right now in the first row only 'App1' is printing then in second row the names etc. Input File I am using as below. :
ISA*16* 00 0*T*>~
GS*IN*APP1*0999~
HPT*1*2~ SE*21*0001~
GE*1*145~
NM1*82*1*Tiger1a*Test1*K****~
NM1*82*1*Lion1a*Test2*K****~
NM1*82*1*Elephant1a*Test3*K****~
ISA*16* 00 0*T*>~
GS*IN*APP2*0999~
HPT*1*2~ SE*21*0001~
GE*1*145~
NM1*82*1*Tiger1a*Test4*K****~
ISA*16* 00 0*T*>~
GS*IN*APP1*0999~
HPT*1*2~
SE*21*0001~
GE*1*145~
NM1*82*1*Tiger1a*Test4*K****~
NM1*82*1*Lion1a*Test5*K****~
NM1*82*1*Elephant1a*Test6*K****~
ISA*16* 00 0*T*>~
GS*IN*APP10999~
HPT*1*2~
SE*21*0001~
GE*1*145~
NM1*82*1*Tiger1a*Test7*K****~
[![enter image description here][2]][2]
Ok, I assume that you have an input file where '~' is a record separator and '*' is a field separator. As the csv module only deals with lines I would first use a generator to split the input file on ~.
Then I would feed 2 lists, one with records starting with NM1*82*1 and containing a list of the 2 following fields, one with records starting with GS containing one single field.
Finally I would add each line of the second list to the corresponding line in the first one.
Code could be:
def splitter(fd, sep):
"""Splits fd (assumed to be an input file object) on sep ignoring end of lines"""
last = ""
for line in fd:
lines = line.strip().split(sep)
lines[0] = last + lines[0]
last = lines.pop()
for l in lines:
yield(l.strip())
if last != "":
yield last.strip()
return
with open(r'C:\\filepath\file.txt', 'r') as input_file, \
open (r'C:\filepath\file.csv', 'a', newline = '') as output_file:
rd = csv.reader(splitter(input_file, '~'), delimiter='*')
wr = csv.writer(output_file)
ls1 = []
ls2 = []
for b in rd:
if b[0] == 'NM1' and b[1] == '82' and b[2] == '1':
ls1.append([b[3], b[4]])
elif b[0] == 'GS':
ls2.append(b[2])
for i, b in enumerate(ls2):
ls1[i].append(b)
wr.writerows(ls1)
I obtain:
Tiger1a,Test1,APP1
Lion1a,Test2,APP2
Elephant1a,Test3,APP1
Tiger1a,Test4,APP10999
Tiger1a,Test4
Lion1a,Test5
Elephant1a,Test6
Tiger1a,Test7
Try reading the files into separate dictionary with lines numbers as keys. You can then iterate through both the dictionaries at the same time using zip function.
def zip(*iterables):
# zip('ABCD', 'xy') --> Ax By
sentinel = object()
iterators = [iter(it) for it in iterables]
while iterators:
result = []
for it in iterators:
elem = next(it, sentinel)
if elem is sentinel:
return
result.append(elem)
yield tuple(result)
More info here: Python3 zip function

Python - CSV Writing - cutting off final rows

I am writing a function to a CSV file (which is working), however it is cutting off halfway on one of the final rows. I know it is probably something to do with the closing of the file, but I thought I did it correctly.
Any suggestions where it may be going wrong?
from itertools import combinations as cb
import csv
import numpy as np
with open("usableReviewScores.csv") as f:
reader=csv.reader(f)
next(reader, None) # skip header
data=[filter(None,i) for i in reader]
writer = csv.writer(open("alexData1.csv", 'wb'))
def avgg(x):
ll=[float(i) for i in x[1:]] #take review no and convert to float
n=len(ll)
avg_list=[x[0]] #start result list with ref no.
final_list=[]
a = 0
b = []
c = []
d = []
global min_val
global max_val
min_val = 0
max_val = 0
for i in range(4,5):
for j in cb(ll,i):
# print(j)
c = i
avg_list.append(sum(j)/i)
final_list.append(sum(j)/i)
a = sum(final_list)/len(final_list)
min_val = min(final_list)
max_val = max(final_list)
d = np.std(final_list)
return (avg_list, "avg", a, "min", min_val, "max", max_val,
"Num of reviews", c, "std", d, "Total Reviews", n)
for x in data:
print(avgg(x))
for x in data:
writer.writerow(avgg(x))
You say that it's probably to do with the closing of the file. Well you don't actually close your output file at all. So I'm guessing that this is a symptom of file-system caching and the cache not being properly flushed because the file isn't closed
You should use with open(filename) as handle: for the writing as well as for your input:
with open("alexData1.csv", 'wb') as outfile:
writer = csv.writer(outfile)
for x in data:
writer.writerow(avgg(x))

How to separate rows of a csv file

I'm having a problem in rewriting a CSV file. What I had was a CSV file with 20 columns and I rewrote it to only 5. Now, I need to take out a couple of unnecessary points, where SN < 20. It works, the only problem is that it doesn't separate the rows. It puts everything in row 1. I'm guessing that its from the,
output_ary.append(row)
but I don't know what else to write there. Here is a part of the code:
import csv
import os
import matplotlib.pyplot as plt
os.chdir("C:\Users\Robert\Documents\qwe")
r = csv.reader(open("gdweights_feh_robert_cmr.csv",'rU'))
w = csv.writer(open("gdweight.csv",'wb',buffering=0))
zerovar2 = 0
for row in r:
if zerovar2==0:
zerovar2 = zerovar2 + 1
else:
sn = float(row[11])
rweight = float(row[17])
tarweight = float(row[18])
fehadop = float(row[25])
weight = rweight*tarweight*fehadop
w.writerow([sn,rweight,tarweight,fehadop,weight])
output_ary = []
with open("gdweight.csv",'rU') as f:
reader = csv.reader(f, delimiter= ',')
zerovar = 0
for row in reader:
if zerovar==0:
zerovar = zerovar + 1
else:
sn = row [0]
zerovar = zerovar + 1
x = float(sn)
if x > 20:
output_ary.append(row)
with open("ouput1.csv",'w') as f2:
for row in output_ary:
for item in row:
f2.write(item + ",")
with open("ouput1.csv",'w') as f2:
for row in output_ary:
for item in row:
f2.write(item + ",")
f2.write("\n") # this is what you're missing
Simple rewrite the last as :
with open("ouput1.csv",'w') as f2:
for row in output_ary:
f2.write(",".join([str(e) for e in item] + '\n')
Now here are a couple of additional comments :
you can use enumerate instead of using a counter :
for i_row, row in enumerate(r) :
...
you can also use a csv writer :
with open("output.txt", "w") as f :
csv_w = csv.writer(f)
for i_row, row in enumerate(output) :
if i_row== 0 :
continue
csv_w.writerow(row)

Categories