calculating means from csv with python's numpy

calculating means from csv with python's numpy - python

I have a 10GB (can't fit in RAM) file of the format:
Col1,Col2,Col3,Col4
1,2,3,4
34,256,348,
12,,3,4
So we have columns and missing values and I want to calculate the means of columns 2 and 3. With plain python I would do something like:
def means(rng):
    """Return the means of a slice of columns of ``data.csv``.

    The file is streamed line by line, so it never has to fit in memory.
    The column titles of the slice are printed before the data is read.

    Args:
        rng: A ``(start, end)`` pair used to slice the comma-separated
            fields of every line, i.e. ``fields[start:end]``.

    Returns:
        A list of floats, one per selected column. Fields that are empty or
        not integers add nothing to the column sum, but the row still counts
        in the divisor. If the file has no data rows, every mean is NaN.
    """
    s, e = rng
    with open("data.csv") as fd:
        # A default keeps a completely empty file from raising StopIteration.
        titles = next(fd, "").split(',')
        print("Means for", ",".join(titles[s:e]))
        totals = [0] * (e - s)
        rows = 0
        for line in fd:
            rows += 1
            for i, v in enumerate(line.split(",")[s:e]):
                try:
                    totals[i] += int(v)
                except ValueError:
                    # Missing or non-integer field: leave the sum unchanged.
                    pass
    if rows == 0:
        return [float("nan")] * (e - s)
    return [float(total) / rows for total in totals]
But I suspect there is a much faster way to do this with numpy (I am still a novice at it).

Pandas is your best friend:
from pandas.io.parsers import read_csv
from numpy import sum
# Load 10000 rows at a time; you can play with this number to get better
# performance on your machine
my_data = read_csv("data.csv", chunksize=10000)
total = 0
count = 0
for chunk in my_data:
    # If you want to exclude NAs from the average, remove the next line
    chunk = chunk.fillna(0.0)
    # Both are per-column Series, so the running totals stay aligned by
    # column name across chunks.
    total += chunk.sum(skipna=True)
    count += chunk.count()
avg = total / count
col1_avg = avg["Col1"]
# ... etc. ...

Try:
import numpy
# read columns 1 and 2 into a masked array; usemask=True masks missing fields
df = numpy.genfromtxt('test.csv', delimiter=',', usecols=(1, 2), skip_header=1, usemask=True)
# calc means on columns (masked entries are left out of each mean)
ans = numpy.mean(df, axis=0)
ans.data will contain an array of all the means for the columns.
EDITS for Updated Question
If you have a 10G file you can chunk it with numpy as well. See this answer.
Something like this:
import itertools

sums = numpy.array((0, 0))
counts = numpy.array((0, 0))
with open('test.csv') as fH:
    fH.readline()  # skip header
    while True:
        # Read the next 1000 lines eagerly so end-of-file is detected here,
        # instead of depending on how genfromtxt reacts to empty input.
        lines = list(itertools.islice(fH, 1000))
        if not lines:
            break
        df = numpy.genfromtxt(lines, delimiter=',', usecols=(1, 2), usemask=True)
        if df.size == 0:
            # The chunk held nothing parseable (e.g. only blank lines).
            continue
        # A chunk with a single row parses to a 1-D array; force rows x cols.
        df = numpy.ma.atleast_2d(df)
        present = ~numpy.ma.getmaskarray(df)
        # Fill masked fields with 0 so a fully masked column in one chunk
        # cannot turn the running sum into a masked value.
        sums = sums + df.filled(0).sum(axis=0)
        counts = counts + present.sum(axis=0)
means = sums / counts

Related

How to transform a csv file into a multi-dimensional list using Python?

I started out with a 4d list, something like
tokens = [[[["a"], ["b"], ["c"]], [["d"]]], [[["e"], ["f"], ["g"]],[["h"], ["i"], ["j"], ["k"], ["l"]]]]
So I converted this to a csv file using the code
import csv
def export_to_csv(tokens):
    """Write a three-level nested list to ``TEST.csv``, one row per leaf.

    Each row holds the three indices of an innermost element followed by the
    element itself, under the header ``A,B,C,word``.

    Args:
        tokens: A list of lists of lists; each innermost element is written
            with its default string form (the csv module calls ``str`` on
            it).
    """
    csv_list = [["A", "B", "C", "word"]]
    for h_index, h in enumerate(tokens):
        for i_index, i in enumerate(h):
            for j_index, j in enumerate(i):
                csv_list.append([h_index, i_index, j_index, j])
    # newline='' hands line-ending control to the csv module, as its
    # documentation requires for files passed to csv.writer.
    with open('TEST.csv', 'w', newline='') as f:
        csv.writer(f).writerows(csv_list)
But now I want to do the reverse process, want to convert a csv file obtained in this format, back to the list format mentioned above.

Assuming you wanted your csv file to look something like this (there were a couple typos in the posted code):
A,B,C,word
0,0,0,a
0,0,1,b
0,0,2,c
...
here's one solution:
import csv
def import_from_csv(filename):
    """Rebuild the nested token list from a CSV of ``A,B,C,word`` rows.

    Args:
        filename: Path of a CSV file whose first row is a header and whose
            remaining rows are three integer indices followed by a word.

    Returns:
        A nested list in which ``result[a][b][c] == [word]`` for every row.
        Positions skipped by the indices keep their placeholders (``[[]]``,
        ``[]`` and ``0`` at the three levels respectively).
    """
    retval = []
    with open(filename, newline='') as fh:
        reader = csv.reader(fh)
        # discard header row (a default avoids StopIteration on an empty file)
        next(reader, None)
        # process data rows
        for x, y, z, word in reader:
            x, y, z = int(x), int(y), int(z)
            # Build every placeholder separately: ``[[[]]] * k`` would repeat
            # ONE inner list k times, so filling one slot would fill them all.
            retval.extend([[]] for _ in range(x + 1 - len(retval)))
            retval[x].extend([] for _ in range(y + 1 - len(retval[x])))
            retval[x][y].extend([0] * (z + 1 - len(retval[x][y])))
            retval[x][y][z] = [word]
    return retval

def import_from_csv(file):
    """Rebuild the nested token list from a CSV written with list values.

    Args:
        file: Path of a CSV file whose first row is a header and whose other
            rows hold three integer indices and a Python literal such as
            ``['a']`` in the fourth column.

    Returns:
        A nested list in which ``data[a][b][c]`` is the parsed literal of
        the row with indices ``a, b, c``.
    """
    import ast
    import csv
    data = []
    # Read the CSV file
    with open(file, newline='') as fp:
        reader = csv.reader(fp)
        # Skip the first line, which contains the headers
        next(reader, None)
        for line in reader:
            # Read the first 3 elements of the line
            a, b, c = (int(i) for i in line[:3])
            # Everything comes back as strings; `literal_eval` turns the
            # fourth column back into a Python list
            value = ast.literal_eval(line[3])
            # Pad every level until the target index exists. Loops (rather
            # than one conditional append) keep index gaps from raising
            # IndexError.
            while len(data) <= a:
                data.append([[[]]])
            while len(data[a]) <= b:
                data[a].append([[]])
            while len(data[a][b]) <= c:
                data[a][b].append([])
            data[a][b][c] = value
    return data
# Test
assert import_from_csv("TEST.csv") == tokens

First, I'd make writing this construction in a CSV format independent from dimensions:
import csv
def deep_iter(seq):
for i, val in enumerate(seq):
if type(val) is list:
for others in deep_iter(val):
yield i, *others
else:
yield i, val
with open('TEST.csv', 'w') as f:
csv.writer(f).writerows(deep_iter(tokens))
Next, we can use the lexicographic order of the indices to recreate the structure. All we have to do is sequentially move deeper into the output list according to the indices of a word. We stop at the penultimate index to get the last list, because the last index is pointing only at the place of the word in this list and doesn't matter due to the natural ordering:
with open('TEST.csv', 'r') as f:
rows = [*csv.reader(f)]
res = []
for r in rows:
index = r[:-2] # skip the last index and word
e = res
while index:
i = int(index.pop(0)) # get next part of a current index
if i < len(e):
e = e[i]
else:
e.append([]) # add new record at this level
e = e[-1]
e.append(r[-1]) # append the word to the corresponding list

Calculate some averages in .txt python

I have a .txt-file called ecc.txt. It contains more than 8000 lines of numbers. I want to count the average of every 360 lines in that file.
Here is the code:
import math
f = open(r'ecc.txt').read()
data = []
for line in data:
sum = 0
for i in range (len(data)):
if i % 360 != 0:
sum = sum + ecc[i]
else:
average = sum / 360
print(average)
sum = 0
When I am running it, nothing happens. I didn't get any results. The code just running and end without any result.
Is there something wrong with this code?
Thank you.

# Maps the running count of numbers read (360, 720, ...) to the average of
# the 360 numbers that end at that count.
avg_dict = {}
with open('ecc.txt') as f:
    # split() with no argument splits on any run of whitespace, so numbers
    # separated by spaces or by newlines both work; pass an explicit
    # separator to split() for any other format.
    data = f.read().split()
total = 0
for i, str_number in enumerate(data, start=1):
    total += int(str_number)
    if i % 360 == 0:
        avg_dict[i] = total / 360
        total = 0
I've assumed that the numbers in your file are separated by whitespace. Otherwise, you can change the sep value in the split method. If there is no separator at all, build data as:
data = list(f.read())

Your code would work with some changes:
import math
data = []
with open(r'ecc.txt') as f:
    for i in f:
        data.append(int(i))
# One pass over the data. The elements at indices 0, 360, 720, ... are not
# added to the running total, and whatever follows the last multiple of 360
# is never averaged.
total = 0
for i, value in enumerate(data):
    if i % 360 != 0:
        total = total + value
    elif i:
        # i == 0 is skipped: nothing has been summed yet at that point.
        average = total / 360
        print(average)
        total = 0
Be aware, though, that this code leaves every 360th element out of the sums (I guess that's not a problem for an average), and that it doesn't produce an average for the trailing elements.

Python - CSV Writing - cutting off final rows

I am writing a function to a CSV file (which is working), however it is cutting off halfway on one of the final rows. I know it is probably something to do with the closing of the file, but I thought I did it correctly.
Any suggestions where it may be going wrong?
from itertools import combinations as cb
import csv
import numpy as np
with open("usableReviewScores.csv") as f:
reader=csv.reader(f)
next(reader, None) # skip header
data=[filter(None,i) for i in reader]
writer = csv.writer(open("alexData1.csv", 'wb'))
def avgg(x):
ll=[float(i) for i in x[1:]] #take review no and convert to float
n=len(ll)
avg_list=[x[0]] #start result list with ref no.
final_list=[]
a = 0
b = []
c = []
d = []
global min_val
global max_val
min_val = 0
max_val = 0
for i in range(4,5):
for j in cb(ll,i):
# print(j)
c = i
avg_list.append(sum(j)/i)
final_list.append(sum(j)/i)
a = sum(final_list)/len(final_list)
min_val = min(final_list)
max_val = max(final_list)
d = np.std(final_list)
return (avg_list, "avg", a, "min", min_val, "max", max_val,
"Num of reviews", c, "std", d, "Total Reviews", n)
for x in data:
print(avgg(x))
for x in data:
writer.writerow(avgg(x))

You say that it's probably to do with the closing of the file. Well you don't actually close your output file at all. So I'm guessing that this is a symptom of file-system caching and the cache not being properly flushed because the file isn't closed
You should use with open(filename) as handle: for the writing as well as for your input:
with open("alexData1.csv", 'wb') as outfile:
writer = csv.writer(outfile)
for x in data:
writer.writerow(avgg(x))

Reading large CSV files from nth line in Python (not from the beginning)

I have 3 huge CSV files containing climate data, each about 5GB.
The first cell in each line is the meteorological station's number (from 0 to about 100,000) each station contains from 1 to 800 lines in each file, which is not necessarily equal in all files. For example, Station 11 has 600, 500, and 200 lines in file1, file2, and file3 respectively.
I want to read all the lines of each station, do some operations on them, then write results to another file, then the next station, etc.
The files are too large to load at once in memory, so I tried some solutions to read them with minimal memory load, like this post and this post which include this method:
with open(...) as f:
for line in f:
<do something with line>
The problem with this method is that it reads the file from the beginning every time, while I want to read the files as follows:
for station in range (100798):
with open (file1) as f1, open (file2) as f2, open (file3) as f3:
for line in f1:
st = line.split(",")[0]
if st == station:
<store this line for some analysis>
else:
break # break the for loop and go to read the next file
for line in f2:
...
<similar code to f1>
...
for line in f3:
...
<similar code to f1>
...
<do the analysis to station, the go to next station>
The problem is that each time I start over to take next station, the for loop would start from the beginning, while I want it to start from where the 'Break' occurs at the nth line, i.e. to continue reading the file.
How can I do it?
Thanks in advance
Notes About the solutions below:
As I mentioned below at the time I posted my answer, I implemented the answer of #DerFaizio but I found it very slow in processing.
After I had tried the generator-based answer submitted by #PM_2Ring I found it very very fast. Maybe because it depends on Generators.
The difference between the two solutions can be seen in the number of stations processed per minute: 2500 st/min for the generator-based solution versus 45 st/min for the Pandas-based solution, i.e. the generator-based solution is more than 55 times faster.
I will keep both implementations below for reference.
Many thanks to all contributors, especially #PM_2Ring.

The code below iterates over the files line by line, grabbing the lines for each station from each file in turn and appending them to a list for further processing.
The heart of this code is a generator file_buff that yields the lines of a file but which allows us to push a line back for later reading. When we read a line for the next station we can send it back to file_buff so that we can re-read it when it's time to process the lines for that station.
To test this code, I created some simple fake station data using create_data.
from random import seed, randrange
seed(123)
station_hi = 5
def create_data():
''' Fill 3 files with fake station data '''
fbase = 'datafile_'
for fnum in range(1, 4):
with open(fbase + str(fnum), 'w') as f:
for snum in range(station_hi):
for i in range(randrange(1, 4)):
s = '{1} data{0}{1}{2}'.format(fnum, snum, i)
print(s)
f.write(s + '\n')
print()
create_data()
# A file buffer that you can push lines back to
def file_buff(fh):
    """Yield the lines of ``fh``, letting the consumer push a line back.

    Calling ``send(line)`` on the generator returns ``line`` to the caller
    straight away, and the same line is then produced again by the next
    ``next()`` call or ``for`` iteration. The generator finishes normally
    once ``fh`` is exhausted.

    Args:
        fh: Any iterator of lines, such as an open text file.
    """
    # Iterating with ``for`` (instead of calling next(fh) inside the body)
    # ends the generator cleanly at end of file; a StopIteration escaping a
    # generator body is a RuntimeError on Python 3.7+ (PEP 479).
    for line in fh:
        pushed = yield line
        while pushed:
            # This first yield is the return value of the send() call.
            yield pushed
            # This one re-delivers the line; it may be pushed back again.
            pushed = yield pushed
# An infinite counter that yields numbers converted to strings
def str_count(start=0):
n = start
while True:
yield str(n)
n += 1
# Extract station data from all 3 files
with open('datafile_1') as f1, open('datafile_2') as f2, open('datafile_3') as f3:
fb1, fb2, fb3 = file_buff(f1), file_buff(f2), file_buff(f3)
for snum_str in str_count():
station_lines = []
for fb in (fb1, fb2, fb3):
for line in fb:
#Extract station number string & station data
sid, sdata = line.split()
if sid != snum_str:
# This line contains data for the next station,
# so push it back to the buffer
rc = fb.send(line)
# and go to the next file
break
# Otherwise, append this data
station_lines.append(sdata)
#Process all the data lines for this station
if not station_lines:
#There's no more data to process
break
print('Station', snum_str)
print(station_lines)
output
0 data100
1 data110
1 data111
2 data120
3 data130
3 data131
4 data140
4 data141
0 data200
1 data210
2 data220
2 data221
3 data230
3 data231
3 data232
4 data240
4 data241
4 data242
0 data300
0 data301
1 data310
1 data311
2 data320
3 data330
4 data340
Station 0
['data100', 'data200', 'data300', 'data301']
Station 1
['data110', 'data111', 'data210', 'data310', 'data311']
Station 2
['data120', 'data220', 'data221', 'data320']
Station 3
['data130', 'data131', 'data230', 'data231', 'data232', 'data330']
Station 4
['data140', 'data141', 'data240', 'data241', 'data242', 'data340']
This code can cope if station data is missing for a particular station from one or two of the files, but not if it's missing from all three files, since it breaks the main processing loop when the station_lines list is empty, but that shouldn't be a problem for your data.
For details on generators and the generator.send method, please see 6.2.9. Yield expressions in the docs.
This code was developed using Python 3, but it will also run on Python 2.6+ (you just need to include from __future__ import print_function at the top of the script).
If there may be station ids missing from all 3 files we can easily handle that. Just use a simple range loop instead of the infinite str_count generator.
from random import seed, randrange
seed(123)
station_hi = 7
def create_data():
''' Fill 3 files with fake station data '''
fbase = 'datafile_'
for fnum in range(1, 4):
with open(fbase + str(fnum), 'w') as f:
for snum in range(station_hi):
for i in range(randrange(0, 2)):
s = '{1} data{0}{1}{2}'.format(fnum, snum, i)
print(s)
f.write(s + '\n')
print()
create_data()
# A file buffer that you can push lines back to
def file_buff(fh):
    """Yield the lines of ``fh``, letting the consumer push a line back.

    Calling ``send(line)`` on the generator returns ``line`` to the caller
    straight away, and the same line is then produced again by the next
    ``next()`` call or ``for`` iteration. The generator finishes normally
    once ``fh`` is exhausted.

    Args:
        fh: Any iterator of lines, such as an open text file.
    """
    # Iterating with ``for`` (instead of calling next(fh) inside the body)
    # ends the generator cleanly at end of file; a StopIteration escaping a
    # generator body is a RuntimeError on Python 3.7+ (PEP 479).
    for line in fh:
        pushed = yield line
        while pushed:
            # This first yield is the return value of the send() call.
            yield pushed
            # This one re-delivers the line; it may be pushed back again.
            pushed = yield pushed
station_start = 0
station_stop = station_hi
# Extract station data from all 3 files
with open('datafile_1') as f1, open('datafile_2') as f2, open('datafile_3') as f3:
fb1, fb2, fb3 = file_buff(f1), file_buff(f2), file_buff(f3)
for i in range(station_start, station_stop):
snum_str = str(i)
station_lines = []
for fb in (fb1, fb2, fb3):
for line in fb:
#Extract station number string & station data
sid, sdata = line.split()
if sid != snum_str:
# This line contains data for the next station,
# so push it back to the buffer
rc = fb.send(line)
# and go to the next file
break
# Otherwise, append this data
station_lines.append(sdata)
if not station_lines:
continue
print('Station', snum_str)
print(station_lines)
output
1 data110
3 data130
4 data140
0 data200
1 data210
2 data220
6 data260
0 data300
4 data340
6 data360
Station 0
['data200', 'data300']
Station 1
['data110', 'data210']
Station 2
['data220']
Station 3
['data130']
Station 4
['data140', 'data340']
Station 6
['data260', 'data360']

I would suggest to use pandas.read_csv. You can specify the rows to skip using skiprows and also use a reasonable number of rows to load depending on your filesize using nrows
Here is a link to the documentation:
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html

I posted the code below before #PM-2Ring posted his solution.
I would like to leave both solutions active:
The #1 solution that depends on Pandas library (by #DerFaizio). :
This solution finished 5450 stations in 120 minutes (about 45 stations/minute)
import pandas as pd
skips =[1, 1, 1] # to skip the header row forever
for station_number in range(100798):
storage = {}
tmax = pd.read_csv(full_paths[0], skiprows=skips[0], header=None, nrows=126000, usecols=[0, 1, 3])
tmin = pd.read_csv(full_paths[1], skiprows=skips[1], header=None, nrows=126000, usecols=[0, 1, 3])
tavg = pd.read_csv(full_paths[2], skiprows=skips[2], header=None, nrows=126000, usecols=[0, 1, 3])
# tmax is at position 0
for idx, station in enumerate(tmax[0]):
if station == station_number:
date_val = tmax[1][idx]
t_val = float(tmax[3][idx])/10.
storage[date_val] = [t_val, None, None]
skips[0] += 1
else:
break
# tmin is at position 1
for idx, station in enumerate(tmin[0]):
# station, date_val, _, val = lne.split(",")
if station == station_number:
date_val = tmin[1][idx]
t_val = float(tmin[3][idx]) / 10.
if date_val in storage:
storage[date_val][1] = t_val
else:
storage[date_val] = [None, t_val, None]
skips[1] += 1
else:
break
# tavg is at position 2
for idx, station in enumerate(tavg[0]):
...
# similar to Tmin
...
pass
station_info = []
for key in storage.keys():
# do some analysis
# Fill the list station_info
pass
data_out.writerows(station_info)
The following solution is the Generator based solution (by #PM-2Ring)
This solution finished 30000 stations in 12 minutes (about 2500 stations/minute)
files = ['Tmax', 'Tmin', 'Tavg']
headers = ['Nesr_Id', 'r_Year', 'r_Month', 'r_Day', 'Tmax', 'Tmin', 'Tavg']
# A file buffer that you can push lines back to
def file_buff(fh):
    """Yield the lines of ``fh``, letting the consumer push a line back.

    Calling ``send(line)`` on the generator returns ``line`` to the caller
    straight away, and the same line is then produced again by the next
    ``next()`` call or ``for`` iteration. The generator finishes normally
    once ``fh`` is exhausted.

    Args:
        fh: Any iterator of lines, such as an open text file.
    """
    # Iterating with ``for`` (instead of calling next(fh) inside the body)
    # ends the generator cleanly at end of file on Python 2 and 3 alike; a
    # StopIteration escaping a generator body is a RuntimeError on
    # Python 3.7+ (PEP 479).
    for line in fh:
        pushed = yield line
        while pushed:
            # This first yield is the return value of the send() call.
            yield pushed
            # This one re-delivers the line; it may be pushed back again.
            pushed = yield pushed
# An infinite counter that yields numbers converted to strings
def str_count(start=0):
n = start
while True:
yield str(n)
n += 1
# NULL = -999.99
# Merge the Tmax, Tmin and Tavg files station by station and write one
# combined row per (station, date). Relies on names defined elsewhere in the
# script: files, headers, source, file_buff, str_count,
# get__proper_data_type and from_excel_ordinal.
print("Time started: {}".format(time.strftime('%Y-%m-%d %H:%M:%S')))
# NOTE: 'wb+' is the Python 2 idiom for csv output; on Python 3 open the
# file with mode 'w' and newline='' instead.
with open('Results\\GHCN_Daily\\Important\\Temp_All_out_gen.csv', 'wb+') as out_file:
    data_out = csv.writer(out_file, quoting=csv.QUOTE_NONE, quotechar='', delimiter=',', escapechar='\\',
                          lineterminator='\n')
    data_out.writerow(headers)
    full_paths = [os.path.join(source, '{}.csv'.format(file_name)) for file_name in files]
    # Extract station data from all 3 files; the third handle must be the
    # third path (Tavg), not a second handle on the first file.
    with open(full_paths[0]) as f1, open(full_paths[1]) as f2, open(full_paths[2]) as f3:
        fb1, fb2, fb3 = file_buff(f1), file_buff(f2), file_buff(f3)
        for snum_str in str_count():
            storage = {}
            count = [0, 0, 0]
            lines_seen = 0
            for file_id, fb in enumerate((fb1, fb2, fb3)):
                for line in fb:
                    lines_seen += 1
                    # NOTE(review): get__proper_data_type is defined
                    # elsewhere; presumably it returns a str only for a
                    # non-numeric first field (e.g. a header) -- confirm.
                    if isinstance(get__proper_data_type(line.split(",")[0]), str):
                        continue
                    # Extract station number string & station data
                    sid, date_val, _dummy, sdata = line.split(",")
                    if sid != snum_str:
                        # This line contains data for the next station,
                        # so push it back to the buffer
                        fb.send(line)
                        # and go to the next file
                        break
                    # Otherwise, store this value in the slot of its file
                    sdata = float(sdata) / 10.
                    count[file_id] += 1
                    if date_val not in storage:
                        storage[date_val] = [None, None, None]
                    storage[date_val][file_id] = sdata
            if not lines_seen:
                # A buffer with data left always yields at least one line
                # (even if it is pushed straight back), so zero lines means
                # all three files are exhausted.
                break
            print("St# {:6d}/100797. Time: {}. Tx({}), Tn({}), Ta({}) ".
                  format(int(snum_str), time.strftime('%H:%M:%S'), count[0], count[1], count[2]))
            station_info = []
            for key, (tx, tn, ta) in storage.items():
                # Derive whichever single value is missing from the other
                # two, using ta = (tx + tn) / 2.
                if ta is None:
                    if tx is not None and tn is not None:
                        ta = round((tx + tn) / 2., 1)
                if tx is None:
                    if tn is not None and ta is not None:
                        tx = round(2. * ta - tn, 1)
                if tn is None:
                    if tx is not None and ta is not None:
                        tn = round(2. * ta - tx, 1)
                py_date = from_excel_ordinal(int(key))
                station_info.append([snum_str, py_date.year, py_date.month, py_date.day, tx, tn, ta])
            data_out.writerows(station_info)
            del station_info
Thanks for all.

Going with the built-in csv module, you could do something like:
with open(csvfile, 'r') as f:
    reader = csv.reader(f, delimiter=',')
    for i in range(n):
        # The next() builtin works on Python 2.6+ and 3; the default keeps a
        # file shorter than n rows from raising StopIteration.
        next(reader, None)
    for row in reader:
        print(row)  # Or whatever you want to do here
Where n is the number of lines you want to skip.

Python file preprocessing (convert column from discrete ranges of values to contiguous range of values.)

I have a dataset of the form:
user_id::item_id1::rating::timestamp
user_id::item_id2::rating::timestamp
user_id::item_id3::rating::timestamp
user_id::item_id4::rating::timestamp
I require the item_ids (there are n distinct item ids in sorted order. Subsequent rows could have the same item ids or different but its guaranteed to be sorted) to be contiguous from 1 to n and they are currently ranging from 1 to k
for k >> n
I have the following code but it isn't quite correct and have been at it for a couple of hours so would really appreciate any help regarding this or if there is a simpler way to do this in python I would really appreciate guidance regarding that as well.
I currently have the following code:
def reOrderItemIds(inputFile,outputFile):
#This is a list in the range of 1 to 10681.
itemIdsRange = set(range(1,10682))
#currKey = 1
currKey = itemIdsRange.pop()
lastContiguousKey=1
#currKey+1
contiguousKey=itemIdsRange.pop()
f = open(inputFile)
g = open(outputFile,"w")
oldKeyToNewKeyMap = dict()
for line in f:
if int(line.split(":")[1]) == currKey and int(line.split(":")[1])==lastContiguousKey:
g.write(line)
elif int(line.split(":")[1])!=currKey and int(line.split(":")[1])!=contiguousKey:
oldKeyToNewKeyMap[line.split(":")[1]]=contiguousKey
lastContiguousKey=contiguousKey
#update current key to the value of the current key.
currKey=int(line.split(":")[1])
contiguousKey=itemIdsRange.pop()
g.write(line.split(":")[0]+":"+str(lastContiguousKey)+":"+line.split(":")[2]+":"+line.split(":")[3])
elif int(line.split(":")[1])==currKey and int(line.split(":")[1])!=contiguousKey:
g.write(line.split(":")[0]+":"+str(lastContiguousKey)+":"+line.split(":")[2]+":"+line.split(":")[3])
elif int(line.split(":")[1])!=currKey and int(line.split(":")[1])==contiguousKey:
currKey = int(line.split(":")[1])
lastContiguousKey=contiguousKey
oldKeyToNewKeyMap[line.split(":")[1]] = lastContiguousKey
contiguousKey=itemIdsRange.pop()
g.write(line.split(":")[0]+":"+str(lastContiguousKey)+":"+line.split(":")[2]+":"+line.split(":")[3])
f.close()
g.close()
Example:
1::1::3::100
10::1::5::104
20::2::3::110
1::5::2::104
I require the output to be of the form:
1::1::3::100
10::1::5::104
20::2::3::110
1::3::2::104
so only the item_ids column changes and everything else remains the same.
Any help would be much appreciated!

Because your data is already sorted by item_id - you can use itertools.groupby() which makes easy work of the solution.
from operator import itemgetter
from itertools import groupby
item_id = itemgetter(1)


def reOrderItemIds(inputFile, outputFile):
    """Rewrite the item-id column so the ids run contiguously from 1.

    Each line of ``inputFile`` has ``::``-separated fields with the item id
    in the second field. Consecutive lines that share an item id form one
    group; every group gets the next integer, starting at 1. All other
    fields are written to ``outputFile`` unchanged. Blank lines are skipped.

    Args:
        inputFile: Path of the file to read; lines with equal item ids must
            be adjacent for them to receive the same new id.
        outputFile: Path of the file to write.
    """
    with open(inputFile) as infile, open(outputFile, "w") as outfile:
        dataset = (line.split('::') for line in infile if line.strip())
        # enumerate advances the new id once per group, not once per line.
        for n, (key, group) in enumerate(groupby(dataset, item_id), start=1):
            for line in group:
                line[1] = str(n)
                outfile.write('::'.join(line))

With my apologies for grossly misreading your question the first time, suppose data is a file containing
1::1::3::100
10::1::5::104
20::2::3::110
30::5::3::121
40::9::7::118
50::10::2::104
(If your data cannot all be cast to integers, this could be modified.)
>>> with open('data', 'r') as datafile:
... dataset = datafile.read().splitlines()
...
>>> ids = {0}
>>> for i, line in enumerate(dataset):
... data = list(map(int, line.split('::')))
... if data[1] not in ids:
... data[1] = max(ids) + 1
... ids.add(data[1])
... dataset[i] = '::'.join((str(d) for d in data))
...
>>> print('\n'.join(dataset))
1::1::3::100
10::1::5::104
20::2::3::110
30::3::3::121
40::4::7::118
50::5::2::104
Again, if your dataset is large, faster solutions can be devised.

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.