I am writing the output of a function to a CSV file (which is working); however, the output cuts off halfway through one of the final rows. I suspect it has something to do with how the file is closed, but I thought I did that correctly.
Any suggestions where it may be going wrong?
from itertools import combinations as cb
import csv
import numpy as np

with open("usableReviewScores.csv") as f:
    reader = csv.reader(f)
    next(reader, None)  # skip header
    data = [filter(None, i) for i in reader]

writer = csv.writer(open("alexData1.csv", 'wb'))

def avgg(x):
    ll = [float(i) for i in x[1:]]  # take review scores and convert to float
    n = len(ll)
    avg_list = [x[0]]  # start result list with ref no.
    final_list = []
    a = 0
    b = []
    c = []
    d = []
    global min_val
    global max_val
    min_val = 0
    max_val = 0
    for i in range(4, 5):
        for j in cb(ll, i):
            # print(j)
            c = i
            avg_list.append(sum(j) / i)
            final_list.append(sum(j) / i)
    a = sum(final_list) / len(final_list)
    min_val = min(final_list)
    max_val = max(final_list)
    d = np.std(final_list)
    return (avg_list, "avg", a, "min", min_val, "max", max_val,
            "Num of reviews", c, "std", d, "Total Reviews", n)

for x in data:
    print(avgg(x))

for x in data:
    writer.writerow(avgg(x))
You say that it's probably to do with the closing of the file. Well, you don't actually close your output file at all, so I'm guessing this is a symptom of file-system caching: the buffered data is never flushed because the file is never closed.
You should use with open(filename) as handle: for the writing as well as for your input:
with open("alexData1.csv", 'wb') as outfile:
writer = csv.writer(outfile)
for x in data:
writer.writerow(avgg(x))
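Putting it together, the whole I/O path could then look like this (a sketch, assuming the avgg function from the question is unchanged), so that both files are closed automatically:

    with open("usableReviewScores.csv") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip header
        data = [filter(None, i) for i in reader]

    with open("alexData1.csv", 'wb') as outfile:
        writer = csv.writer(outfile)
        for x in data:
            writer.writerow(avgg(x))  # file is flushed and closed on exit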
I started out with a 4d list, something like
tokens = [[[["a"], ["b"], ["c"]], [["d"]]], [[["e"], ["f"], ["g"]],[["h"], ["i"], ["j"], ["k"], ["l"]]]]
So I converted this to a csv file using the code
import csv

def export_to_csv(tokens):
    csv_list = [["A", "B", "C", word]]
    for h_index, h in enumerate(tokens):
        for i_index, i in enumerate(h):
            for j_index, j in enumerate(i):
                csv_list.append([h_index, i_index, j_index, j])
    with open('TEST.csv', 'w') as f:
        # using csv.writer method from CSV package
        write = csv.writer(f)
        write.writerows(csv_list)
But now I want to do the reverse process, want to convert a csv file obtained in this format, back to the list format mentioned above.
Assuming you wanted your csv file to look something like this (there were a couple typos in the posted code):
A,B,C,word
0,0,0,a
0,0,1,b
0,0,2,c
...
here's one solution:
import csv

def import_from_csv(filename):
    retval = []
    with open(filename) as fh:
        reader = csv.reader(fh)
        # discard header row
        next(reader)
        # process data rows
        for (x, y, z, word) in reader:
            x = int(x)
            y = int(y)
            z = int(z)
            # grow each level of the structure just enough to hold the new indices
            retval.extend([[[]]] * (x + 1 - len(retval)))
            retval[x].extend([[]] * (y + 1 - len(retval[x])))
            retval[x][y].extend([0] * (z + 1 - len(retval[x][y])))
            retval[x][y][z] = [word]
    return retval
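A quick round-trip check (assuming the simple a/b/c csv format shown above and the tokens list from the question):

    assert import_from_csv('TEST.csv') == tokens

An alternative answer keeps the question's original output format, where the last cell is the string representation of a list such as ['a'], and parses it back with ast.literal_eval: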
def import_from_csv(file):
    import ast
    import csv
    data = []
    # Read the CSV file
    with open(file) as fp:
        reader = csv.reader(fp)
        # Skip the first line, which contains the headers
        next(reader)
        for line in reader:
            # Read the first 3 elements of the line
            a, b, c = [int(i) for i in line[:3]]
            # When we read it back, everything comes in as strings. Use
            # `literal_eval` to convert it to a Python list
            value = ast.literal_eval(line[3])
            # Extend the list to accommodate the new element
            data.append([[[]]]) if len(data) < a + 1 else None
            data[a].append([[]]) if len(data[a]) < b + 1 else None
            data[a][b].append([]) if len(data[a][b]) < c + 1 else None
            data[a][b][c] = value
    return data

# Test
assert import_from_csv("TEST.csv") == tokens
First, I'd make the writing of this structure to a CSV file independent of the number of dimensions:
import csv

def deep_iter(seq):
    # recursively walk the nested lists, yielding (index, index, ..., value)
    for i, val in enumerate(seq):
        if type(val) is list:
            for others in deep_iter(val):
                yield i, *others
        else:
            yield i, val

with open('TEST.csv', 'w') as f:
    csv.writer(f).writerows(deep_iter(tokens))
Next, we can use the lexicographic order of the indices to recreate the structure. All we have to do is sequentially move deeper into the output list according to the indices of a word. We stop at the penultimate index to get the innermost list, because the last index only points at the word's position within that list, and that position is already guaranteed by the natural ordering:
with open('TEST.csv', 'r') as f:
    rows = [*csv.reader(f)]

res = []
for r in rows:
    index = r[:-2]  # skip the last index and the word
    e = res
    while index:
        i = int(index.pop(0))  # get the next part of the current index
        if i < len(e):
            e = e[i]
        else:
            e.append([])  # add a new record at this level
            e = e[-1]
    e.append(r[-1])  # append the word to the corresponding list
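As a quick sanity check (assuming the tokens list from the question is still in scope):

    # the rebuilt structure should mirror the original 4d list
    assert res == tokens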
import csv

f = open("savewl_ssj500k22_Minfreq1-lowercaseWords_1.csv", "r")
csvF = csv.reader(f, delimiter="\t")
s = 0
sez = []
sezB = []
for q in f:
    s = s + 1
    if s > 3:
        l = q.split(",")
        x = l[1]
        y = l[0]
        sezB.append(y)
        sezB.append(int(x))
        sez.append(sezB)
print(sez)
f.close()
How do I get this to work so that all rows from the .csv end up saved in the list sez?
Running this code I get: MemoryError
The file has 77214 lines, each looking something like: je,17031
On every iteration you append sezB to sez, while sezB itself keeps growing, so the output apparently grows as O(number of lines²).
It follows a pattern like this (just for the explanation):
[[1,2], [1,2,3,4], [1,2,3,4,5,6], .....]
I guess you wanted to reset sezB to [] on every iteration.
Your code can be simplified to
import csv

s = 0
sez = []
sezB = []
with open("savewl_ssj500k22_Minfreq1-lowercaseWords_1.csv", "r") as f:
    csvF = csv.reader(f, delimiter="\t")
    for q in f:
        s += 1
        if s > 3:
            l = q.split(",")
            x, y = l[:2]
            sezB.extend([x, int(y)])
            sez.append(sezB)
print(sez)
As you can see, you constantly add two more elements to the sezB list, which is not much by itself, but you also keep appending the ever-growing sezB list to sez.
So since the file has 77214 lines, printing sez has to materialize about 6 billion (5,962,079,010) entries, which is far too many to fit into memory...
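For reference, a minimal sketch of the fix both answers point at: build a fresh two-element list per row instead of reusing one accumulator (the filename and the skip-three-lines rule are taken from the question):

    import csv

    sez = []
    with open("savewl_ssj500k22_Minfreq1-lowercaseWords_1.csv") as f:
        for s, row in enumerate(csv.reader(f), start=1):
            if s > 3 and len(row) >= 2:
                # a fresh list per row, so sez ends up holding 77214 small lists
                sez.append([row[0], int(row[1])])
    print(len(sez))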
Hi everyone, I received data in an Excel (xls) spreadsheet that is formatted as in the first table, illustrated above.
I am attempting to rearrange this data into the format shown in the table just below. Any help would be greatly appreciated.
Thanks much.
First, save it to a .csv file
import csv

curr = []
with open('file.csv') as infile, open('path/to/output', 'w') as fout:
    outfile = csv.writer(fout)
    for area, pop10, pop20, pop50 in csv.reader(infile):
        if curr and curr[0] != area:
            outfile.writerow(curr)
            curr = [area, pop10, pop20, pop50]
            continue
        if pop10: curr[1] = pop10
        if pop20: curr[2] = pop20
        if pop50: curr[3] = pop50
You can do this pretty succinctly using Pandas:
import pandas as pd
dataframe = pd.read_excel("in.xlsx")
merged = dataframe.groupby("AREA").sum()
merged.to_excel("out.xlsx")
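To see why this works: groupby("AREA").sum() skips NaNs by default, so values scattered across several rows collapse into one row per area. A toy sketch with made-up column names and numbers:

    import pandas as pd

    df = pd.DataFrame({
        "AREA": ["x", "x", "y"],
        "POP10": [100.0, None, 50.0],
        "POP20": [None, 120.0, 60.0],
    })
    print(df.groupby("AREA").sum())
    #       POP10  POP20
    # AREA
    # x     100.0  120.0
    # y      50.0   60.0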
So, if the csv has 11 columns where 'AREA' is the second column, would the code be:

def CompressRow(in_csv, out_file):
    curr = []
    with open(in_csv) as infile, open(out_file, 'w') as fout:
        outfile = csv.writer(fout)
        for a, b, c, d, e, f, g, h, i, j, k in csv.reader(infile):
            if curr and curr[1] != b:
                outfile.writerow(curr)
                curr = [a, b, c, d, e, f, g, h, i, j, k]
                continue
            if a: curr[0] = a
            if c: curr[2] = c
            if d: curr[3] = d
            if e: curr[4] = e
            if f: curr[5] = f
            if g: curr[6] = g
            if h: curr[7] = h
            if i: curr[8] = i
            if j: curr[9] = j
            if k: curr[10] = k

# execute CompressRow(in_csv, out_file)
I tried executing it and it gives me
if a: curr[0]=a
IndexError: list assignment index out of range
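That IndexError happens because curr starts out empty: on the very first row the key check is false (curr is []), so execution falls through to curr[0] = a before curr has any elements. A hedged sketch of one way around it, seeding curr from the first row and merging later rows into it (the function name and the final flush are my additions, not from the original answer):

    import csv

    def compress_rows(in_csv, out_file, key_col=1):
        # merge consecutive rows that share the key column, keeping the
        # last non-empty value seen in every other column
        with open(in_csv) as infile, open(out_file, 'w') as fout:
            writer = csv.writer(fout)
            curr = None
            for row in csv.reader(infile):
                if curr is None:
                    curr = row  # seed from the very first row
                elif row[key_col] != curr[key_col]:
                    writer.writerow(curr)  # key changed: flush the merged row
                    curr = row
                else:
                    # same key: fill in any non-empty cells from this row
                    curr = [new or old for old, new in zip(curr, row)]
            if curr is not None:
                writer.writerow(curr)  # flush the final group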
I have a 10GB (can't fit in RAM) file of the format:
Col1,Col2,Col3,Col4
1,2,3,4
34,256,348,
12,,3,4
So we have columns and missing values and I want to calculate the means of columns 2 and 3. With plain python I would do something like:
def means(rng):
    s, e = rng
    with open("data.csv") as fd:
        title = next(fd)
        titles = title.split(',')
        print "Means for", ",".join(titles[s:e])
        ret = [0] * (e - s)
        for c, l in enumerate(fd):
            vals = l.split(",")[s:e]
            for i, v in enumerate(vals):
                try:
                    ret[i] += int(v)
                except ValueError:
                    pass
        return map(lambda s: float(s) / (c + 1), ret)
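For columns 2 and 3 the call would look like this (a quick sketch against the three sample rows above; note that missing values count as zero here):

    print means((1, 3))  # columns are the zero-based slice 1:3 -> [86.0, 118.0]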
But I suspect there is a much faster way to do this with numpy (I am still a novice at it).
Pandas is your best friend:
from pandas.io.parsers import read_csv
from numpy import sum

# Load 10000 elements at a time; you can play with this number to get better
# performance on your machine
my_data = read_csv("data.csv", chunksize=10000)

total = 0
count = 0
for chunk in my_data:
    # If you want to exclude NAs from the average, remove the next line
    chunk = chunk.fillna(0.0)
    total += chunk.sum(skipna=True)
    count += chunk.count()

avg = total / count
col1_avg = avg["Col1"]
# ... etc. ...
Try:
import numpy

# read from csv into a masked record array
dat = numpy.genfromtxt('test.csv', delimiter=',', usecols=(1, 2), skip_header=1, usemask=True)

# calc means on columns
ans = numpy.mean(dat, axis=0)
ans.data will contain an array of all the means for the columns.
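Note that the masked mean skips missing values instead of counting them as zero, so the result differs from the plain-python version above (a sketch against the same sample rows):

    print ans.data  # roughly [ 129.  118.], versus [86.0, 118.0] when NAs count as 0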
EDITS for Updated Question
If you have a 10G file you can chunk it with numpy as well. See this answer.
Something like this:
import itertools
import numpy

sums = numpy.array((0, 0))
counts = numpy.array((0, 0))
fH = open('test.csv')
fH.readline()  # skip header
while True:
    try:
        df = numpy.genfromtxt(itertools.islice(fH, 1000), delimiter=',', usecols=(1, 2), usemask=True)
    except StopIteration:
        break
    sums = sums + numpy.sum(df, 0)
    counts = counts + numpy.sum(df.mask == False, 0)
fH.close()
means = sums / counts
I wrote a Python script to run a SQL query in PostgreSQL:
import sys, os, math
os.chdir(r'C:\Users\Heinz\Desktop')
print os.getcwd()

# set up psycopg2 environment
import psycopg2

# shortest_path module
query = """
select *
from shortest_path ($$
    select
        gid as id,
        source::int4 as source,
        target::int4 as target,
        cost::double precision as cost,
        rcost::double precision as reverse_cost
    from network
$$, %s, %s, %s, %s
)
"""

# make connection between python and postgresql
conn = psycopg2.connect("dbname = 'test' user = 'postgres' host = 'localhost' password = 'xxxx'")
cur = conn.cursor()

# count rows in the table
cur.execute("select count(*) from network")
result = cur.fetchone()
k = result[0] + 1  # number of points = number of segments + 1

# run loops
# import csv module
import csv
import tempfile

element = []
i = 1
l = 1
filename = 'pi_value.csv'
with open(filename, 'wb') as f:
    while i <= k:
        while l <= k:
            cur.execute(query, (i, l, True, True))
            element = cur.fetchall()
            product = sum([a[-1] for a in element[:-1]])
            writer = csv.writer(f, delimiter=',')
            writer.writerow([product])
            element = []
            l = l + 1
        l = 1
        i = i + 1
You can see that I used iterators from i to k (and l to k) to drive the while loops. Now I have a csv file containing the numbers I want the iterators i and l to take. For example, here's the csv file,
I want the iterators to loop through the numbers in each row, starting from the first one: in the innermost while loop, l = 6, l = 31, l = 28, ..., l = 17; i starts from 6 too, but only moves on to i = 31 once l has reached 17 and wrapped back around to l = 6, and so on.
How do I write additional lines to read this csv file and make the iterators in the while loops run over the numbers in the file?
Update#1
I tried this,
element = []
with open('tc_sta_id.csv') as f1, open('pi_value.csv', 'wb') as f2:
    csvs = csv.reader(f1)
    col_num = 0
    rows = list(csvs)
    k = len(rows)
    for row in csvs:
        i = row[col_num]
        l = row[col_num]
        while i <= k:
            while l <= k:
                cur.execute(query, (i, l, True, True))
                element = cur.fetchall()
                product = sum([a[-1] for a in element[:-1]])
                writer = csv.writer(f2, delimiter=',')
                writer.writerow([product])
                element = []
                l = l + 1
            l = row[col_num]
            i = i + 1
The script runs fine, but the output csv file is completely blank; please give me suggestions to fix this!
Since your question has changed quite a bit since the start, I'm adding this as a separate answer. This answer is specifically about your update 1.
The condition for your while loop is wrong. Your condition is based on the number of rows in your csv (8 in your example), but you compare it with the numbers found in the csv (6, 31, ...). This means your while loops stop every time they hit the second number (31 > 8). Moreover, you're not jumping to the next element of your csv; you just add 1. I haven't tried to run your code, but I think you're looping over i = 6, 7, 8 with l = 6, 7, 8 for each value of i. Then it tries 31 and stops immediately, as it does with the rest (they're all over 8).
I'm not entirely sure what you want, as you seem to keep wanting extra while loops for something, but I can't tell what for (I can't find it in your question; everything in it implies for loops only).
I'm also not sure whether i and l come from the same csv or not. I made you a solution where you can easily make i and l come from different csvs, but I set them at the beginning to come from the same one. If they come from the same csv, you cannot just nest the for loops with the same iterator, so we cheat and extract them into a list (I tested this with a simple example).
rows = list(csvs)  # convert to a list to avoid problems with iterating over the same iterator
csv_for_i = rows
csv_for_l = rows

for row_i in csv_for_i:
    i = row_i[col_num]
    for row_l in csv_for_l:
        l = row_l[col_num]
        cur.execute(query, (i, l, True, True))
        element = cur.fetchall()
        product = sum([a[-1] for a in element[:-1]])
        writer = csv.writer(f2, delimiter=',')
        writer.writerow([product])
        element = []
Let me know if this works. If so, accept the answer and I'll think about how to make the question and the answers into something that works more nicely on stack overflow. Currently, there are actually multiple questions and answers here and that's confusing for other people searching for answers.
Just for info, here is a small example of the pitfalls with iterators (made with csv, but it applies to all iterators).
import csv

# test.csv contents:
#
# 6
# 31
# 17

print 'Case 1:'
with open('test.csv') as f1:
    csv1 = csv.reader(f1)
    csv2 = csv.reader(f1)
    for el1 in csv1:
        for el2 in csv2:
            print el1, el2
# Results
#
# ['6'] ['31']
# ['6'] ['17']

print 'Case 2:'
with open('test.csv') as f1:
    csvs = csv.reader(f1)
    rows = list(csvs)
    for el1 in rows:
        for el2 in rows:
            print el1, el2
# Results
#
# ['6'] ['6']
# ['6'] ['31']
# ['6'] ['17']
# ['31'] ['6']
# ['31'] ['31']
# ['31'] ['17']
# ['17'] ['6']
# ['17'] ['31']
# ['17'] ['17']

print 'Case 3:'
with open('test.csv') as f1, open('test.csv') as f2:
    for el1 in csv.reader(f1):
        for el2 in csv.reader(f2):
            print el1, el2
# Results
#
# ['6'] ['6']
# ['6'] ['31']
# ['6'] ['17']

print 'Case 4:'
with open('test.csv') as f1, open('test.csv') as f2:
    csv1 = csv.reader(f1)
    csv2 = csv.reader(f2)
    for el1 in csv1:
        for el2 in csv2:
            print el1, el2
# Results
#
# ['6'] ['6']
# ['6'] ['31']
# ['6'] ['17']
col_num is the column number in which you have your i values
with open('yourfile') as file:
    reader = csv.reader(file)  # don't shadow the csv module with a variable named csv
    next(reader)  # skip the header
    col_num = 0
    for row in reader:
        i = int(row[col_num])  # csv values come in as strings, so cast before comparing
        while i <= k:
            cur.execute(query, (i, 100000000000, True, True))
            rs.append(cur.fetchall())
            i = i + 1
I made you a short test using just simple python functionality.
f = open('test.csv')
csvlines = f.readlines()
f.close()
numbers = [int(n.split(',')[0]) for n in csvlines]
You might have to replace ',' with ';' or something else depending on the locale settings of your operating system.
Short explanation:
csvlines will contain the rows of your csv as strings, e.g. ['1,a,some text', '2,b,some other text']. You go through each of those lines and call split on it, e.g. '1,a,some text'.split(',') gives ['1', 'a', 'some text']. The first column then needs to be cast to an integer because at that point it is still a string.
Use it in your code like this (edited as the question was edited):
for i in numbers:
    if i < k:
        for l in numbers:
            # not sure what your constraint on k is, but you can stop iterating
            # through the numbers with a simple if
            if l < k:
                # do work (you can use i and l here; they will automatically
                # take the next value on each iteration of the for loop
                # (try print i, l for example): 6,6; 6,31; ...; 6,17; 31,6; 31,31
                pass
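For completeness, a hedged sketch of how these pieces could plug into the question's loop (assuming cur, query, and an open csv.writer named writer exist exactly as in the question):

    for i in numbers:
        for l in numbers:
            cur.execute(query, (i, l, True, True))
            element = cur.fetchall()
            # sum the cost column over all but the last row, as in the question
            product = sum([a[-1] for a in element[:-1]])
            writer.writerow([product])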