Related
I started out with a 4d list, something like
tokens = [[[["a"], ["b"], ["c"]], [["d"]]], [[["e"], ["f"], ["g"]],[["h"], ["i"], ["j"], ["k"], ["l"]]]]
So I converted this to a csv file using the code
import csv
def export_to_csv(tokens):
csv_list = [["A", "B", "C", word]]
for h_index, h in enumerate(tokens):
for i_index, i in enumerate(h):
for j_index, j in enumerate(i):
csv_list.append([h_index, i_index, j_index, j])
with open('TEST.csv', 'w') as f:
# using csv.writer method from CSV package
write = csv.writer(f)
write.writerows(csv_list)
But now I want to do the reverse process, want to convert a csv file obtained in this format, back to the list format mentioned above.
Assuming you wanted your csv file to look something like this (there were a couple typos in the posted code):
A,B,C,word
0,0,0,a
0,0,1,b
0,0,2,c
...
here's one solution:
import csv
def import_from_csv(filename):
retval = []
with open(filename) as fh:
reader = csv.reader(fh)
# discard header row
next(reader)
# process data rows
for (x,y,z,word) in reader:
x = int(x)
y = int(y)
z = int(z)
retval.extend([[[]]] * (x + 1 - len(retval)))
retval[x].extend([[]] * (y + 1 - len(retval[x])))
retval[x][y].extend([0] * (z + 1 - len(retval[x][y])))
retval[x][y][z] = [word]
return retval
def import_from_csv(file):
import ast
import csv
data = []
# Read the CSV file
with open(file) as fp:
reader = csv.reader(fp)
# Skip the first line, which contains the headers
next(reader)
for line in reader:
# Read the first 3 elements of the line
a, b, c = [int(i) for i in line[:3]]
# When we read it back, everything comes in as strings. Use
# `literal_eval` to convert it to a Python list
value = ast.literal_eval(line[3])
# Extend the list to accomodate the new element
data.append([[[]]]) if len(data) < a + 1 else None
data[a].append([[]]) if len(data[a]) < b + 1 else None
data[a][b].append([]) if len(data[a][b]) < c + 1 else None
data[a][b][c] = value
return data
# Test
assert import_from_csv("TEST.csv") == tokens
First, I'd make writing this construction in a CSV format independent from dimensions:
import csv
def deep_iter(seq):
for i, val in enumerate(seq):
if type(val) is list:
for others in deep_iter(val):
yield i, *others
else:
yield i, val
with open('TEST.csv', 'w') as f:
csv.writer(f).writerows(deep_iter(tokens))
Next, we can use the lexicographic order of the indices to recreate the structure. All we have to do is sequentially move deeper into the output list according to the indices of a word. We stop at the penultimate index to get the last list, because the last index is pointing only at the place of the word in this list and doesn't matter due to the natural ordering:
with open('TEST.csv', 'r') as f:
rows = [*csv.reader(f)]
res = []
for r in rows:
index = r[:-2] # skip the last index and word
e = res
while index:
i = int(index.pop(0)) # get next part of a current index
if i < len(e):
e = e[i]
else:
e.append([]) # add new record at this level
e = e[-1]
e.append(r[-1]) # append the word to the corresponding list
I want to sum all values that have the same name / ID in a csv file
Right now I am only looking for ID with the name 'company'
csv file format:
company A, 100
company B, 200
company A, 300
The end result I am looking for is:
company A, 400
company B, 200
total: 600
My code so far:
import csv
name = ''
num = ''
total = 0
with open('xx.csv', 'r', newline='') as csvfile:
reader = csv.reader(csvfile)
next(csvfile)
for a in reader:
if a[0].__contain__('company'):
name = (a[0])
num = (a[1])
total += float(a[1])
print(str(name) + ', ' + str(num))
print('total: ' + str(total))
First, CSV typically have commas, and the delimiter for csv.reader must be a single character, so I suggest updating your file to properly use commas.
Secondly, to aggregate the companies, you need to store them as you iterate the file. Easiest way is to use a dictionary type.
Then only after you've aggregated everything, should you create a second loop to go over the aggregated values, then print the final total.
import csv
from collections import defaultdict
totals = defaultdict(int)
total = 0
with open('companies.csv') as csvfile:
reader = csv.reader(csvfile, delimiter=',')
# next (csvfile) # shown file has no header
for row in reader:
if not row[0].startswith('company'):
continue
name, value = row
totals[name] += float(value)
total = 0
for name, value in totals.items():
print(f'{name},{value}')
total += value
print(f'total: {total}')
You don't necessarily need to use csv module here. Just read every single line split them from right (rsplit) and fill a dictionary like below:
d = {}
with open('your_file.csv') as f:
# next(f) - If header needs to be skipped
for line in f:
name, value = line.rsplit(',', maxsplit=1)
d[name] = d.get(name, 0) + int(value)
for k, v in d.items():
print(f"{k}, {v}")
print(f"total: {sum(d.values())}")
output:
company A, 400
company B, 200
total: 600
In order not to iterate again through the dictionary's values to calculate the total(I mean in sum(d.values()) expression), you can do add to total while you are printing the items like:
d = {}
with open('new.csv') as f:
for line in f:
name, value = line.rsplit(',', maxsplit=1)
d[name] = d.get(name, 0) + int(value)
total = 0
for k, v in d.items():
total += v
print(f"{k}, {v}")
print(f"total: {total}")
I am trying to open a csv file with csv.DictReader, read in just the first 5 rows of data, perform the primary process of my script, then read in the next 5 rows and do the same for them. Rinse and repeat.
I believe I have a method that works, however I am having issues with the last lines of the data not processing. I know I need to modify my if statement so that it also checks for if I'm at the end of the file, but am having trouble finding a way to do that. I've found methods online, but they involve reading in the whole file to get a row count but doing so would defeat the purpose of this script as I'm dealing with memory issues.
Here is what I have so far:
import csv
count = 0
data = []
with open('test.csv') as file:
reader = csv.DictReader(file)
for row in reader:
count +=1
data.append(row)
if count % 5 == 0 or #something to check for the end of the file:
#do stuff
data = []
Thank you for the help!
You can use the chunksize argument when reading in the csv. This will step by step read in the number of lines:
import pandas as pd
reader = pd.read_csv('test.csv', chunksize=5)
for df in reader:
# do stuff
You can handle the remaining lines after the for loop body. You can also use the more pythonic enumerate.
import csv
data = []
with open('test.csv') as file:
reader = csv.DictReader(file)
for count, row in enumerate(reader, 1):
data.append(row)
if count % 5 == 0:
# do stuff
data = []
print('handling remaining lines at end of file')
print(data)
considering the file
a,b
1,1
2,2
3,3
4,4
5,5
6,6
7,7
outputs
handling remaining lines at end of file
[OrderedDict([('a', '6'), ('b', '6')]), OrderedDict([('a', '7'), ('b', '7')])]
This is one approach using the iterator
Ex:
import csv
with open('test.csv') as file:
reader = csv.DictReader(file)
value = True
while value:
data = []
for _ in range(5): # Get 5 rows
value = next(reader, False)
if value:
data.append(value)
print(data) #List of 5 elements
Staying along the lines of what you wrote and not including any other imports:
import csv
data = []
with open('test.csv') as file:
reader = csv.DictReader(file)
for row in reader:
data.append(row)
if len(data) > 5:
del data[0]
if len(data) == 5:
# Do something with the 5 elements
print(data)
The if statements allow the array to be loaded with 5 elements before processing on the begins.
class ZeroItterNumberException(Exception):
pass
class ItterN:
def __init__(self, itterator, n):
if n<1:
raise ZeroItterNumberException("{} is not a valid number of rows.".format(n))
self.itterator = itterator
self.n = n
self.cache = []
def __iter__(self):
return self
def __next__(self):
self.cache.append(next(self.itterator))
if len(self.cache) < self.n:
return self.__next__()
if len(self.cache) > self.n:
del self.cache[0]
if len(self.cache) == 5:
return self.cache
I have to read in a file line by line that has indices of where a vector has 1's
so for example:
1 3 9 10
means:
0,1,0,1,0,0,0,0,0,1,1
My goal is to write program that will take each line and print out the full vector with the 0's.
I am able to do this with my current program for a few lines:
#create a sparse vector
list_line_sparse = [0] * int(num_features)
#loop over all the lines
for item in lines:
#split the line on spaces
zz = item.split(' ')
#get all ints on a line
d = [int(x.strip()) for x in zz]
#loop over all ints and change index to 1 in sparse vector
for i in d:
list_line_sparse[i]=1
out_file += (', '.join(str(item) for item in list_line_sparse))
#change back to 0's
for i in d:
list_line_sparse[i]=0
out_file +='\n'
f = open('outfile', 'w')
f.write(out_file)
f.close()
The problem is that for a file with a lot of features and lines, my program is very very inefficient - it basically never finishes. Is there anything sticking out that I should change to make it more efficent? (I.e. the 2 for loops)
It would probably be more efficient to write each line of data to your output file as it is generated, rather than building up a huge string in memory.
numpy is a popular Python module that's good for doing bulk operations on numbers. If you start with:
import numpy as np
list_line_sparse = np.zeros(num_features, dtype=np.uint8)
Then, given d as the list of numbers on the current line, you can simply do:
list_line_sparse[d] = 1
to set ALL of those indexes in the array at the same time, no loop required. (At the Python level at least, obviously there's still a loop involved, but it's down in the C implementation of numpy).
It is slowing down because you are doing string concatenation. It is better to work with lists.
Also you could use csv to read your space separated lines in, and to then write each row with commas automatically added:
import csv
num_features = 20
with open('input.txt', 'r', newline='') as f_input, open('output.txt', 'w', newline='') as f_output:
csv_input = csv.reader(f_input, delimiter=' ')
csv_output = csv.writer(f_output)
for row in csv_input:
list_line_sparse = [0] * int(num_features)
for v in map(int, row):
list_line_sparse[v] = 1
csv_output.writerow(list_line_sparse)
So if input.txt contained the following:
1 3 9 10
1 3 9 11
2 7 3 5
Giving you an output.txt containing:
0,1,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0
0,1,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0
0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
Too much loops: first, the item.split(), then the for x in zz, then for i in d, then for item in list_line_sparse, and then for i in d again. Strings concatenations could be your most expensive part: the .join and the output +=. And all this for every line.
You could try a "character by character" parsing and writing. Something like this:
#features per line
count = int(num_features)
f = open('outfile.txt', 'w')
#loop over all lines
for item in lines:
#reset the feature
i = 0
#the characters buffer
index = ""
#parse character by character
for character in item:
#if a space or end of line is found,
#and the characters buffer (index) is not empty
if character in (" ", "\r", "\n"):
if index:
#parse the characters buffer
index = int(index)
#if is not the first feature
if i > 0:
#add the separator
f.write(", ")
#add 0's until index
while i < index:
f.write("0, ")
i += 1
#and write 1
f.write("1")
i += 1
#reset the characters buffer
index = ""
#if is not a space or end on line
else:
#add the character to the buffer
index += character
#if the last line didn't end with a carriage return,
#index could be waiting to be parsed
if index:
index = int(index)
if i > 0:
f.write(", ")
while i < index:
f.write("0, ")
i += 1
f.write("1")
i += 1
index = ""
#fill with 0's
while i < count:
if i == 0:
f.write("0")
else:
f.write(", 0")
i += 1
f.write("\n")
f.close()
Let's rework your code into a simpler package that takes better advantage of Python's features:
import sys
NUM_FEATURES = 12
with open(sys.argv[1]) as source, open(sys.argv[2], 'w') as sink:
for line in source:
list_line_sparse = [0] * NUM_FEATURES
indicies = map(int, line.rstrip().split())
for index in indicies:
list_line_sparse[index] = 1
print(*list_line_sparse, file=sink, sep=',')
I revisited this problem with your "more efficiently" in mind. Although the above is more memory efficient, it is a hair slower time-wise. I reconsidered your original and came up with a solution that is less memory efficient but about 2x faster than your code:
import sys
NUM_FEATURES = 12
data = ''
with open(sys.argv[1]) as source:
for line in source:
list_line_sparse = ["0"] * NUM_FEATURES
indicies = map(int, line.rstrip().split())
for index in indicies:
list_line_sparse[index] = "1"
data += ",".join(list_line_sparse) + '\n'
with open(sys.argv[2], 'w') as sink:
sink.write(data)
Like your original solution, it stores all the data in memory and writes it out at the end which is both a disadvantage (memory-wise) and an advantage (time-wise.)
input.txt
1 3 9 10
1 3 9 11
2 7 3 5
USAGE
% python3 test.py input.txt output.txt
output.txt
0,1,0,1,0,0,0,0,0,1,1,0
0,1,0,1,0,0,0,0,0,1,0,1
0,0,1,1,0,1,0,1,0,0,0,0
I have 3 huge CSV files containing climate data, each about 5GB.
The first cell in each line is the meteorological station's number (from 0 to about 100,000) each station contains from 1 to 800 lines in each file, which is not necessarily equal in all files. For example, Station 11 has 600, 500, and 200 lines in file1, file2, and file3 respectively.
I want to read all the lines of each station, do some operations on them, then write results to another file, then the next station, etc.
The files are too large to load at once in memory, so I tried some solutions to read them with minimal memory load, like this post and this post which include this method:
with open(...) as f:
for line in f:
<do something with line>
The problem with this method that it reads the file from the beginning every time, while I want to read files as follows:
for station in range (100798):
with open (file1) as f1, open (file2) as f2, open (file3) as f3:
for line in f1:
st = line.split(",")[0]
if st == station:
<store this line for some analysis>
else:
break # break the for loop and go to read the next file
for line in f2:
...
<similar code to f1>
...
for line in f3:
...
<similar code to f1>
...
<do the analysis to station, the go to next station>
The problem is that each time I start over to take next station, the for loop would start from the beginning, while I want it to start from where the 'Break' occurs at the nth line, i.e. to continue reading the file.
How can I do it?
Thanks in advance
Notes About the solutions below:
As I mentioned below at the time I posted my answer, I implemented the answer of #DerFaizio but I found it very slow in processing.
After I had tried the generator-based answer submitted by #PM_2Ring I found it very very fast. Maybe because it depends on Generators.
The difference between the two solutions can be noticed by the numbers of processed stations per minutes which are 2500 st/min for the generator based solution, and 45 st/min for the Pandas based solution. where the Generator based solution is >55 times faster.
I will keep both implementations below for reference.
Many thanks to all contributors, especially #PM_2Ring.
The code below iterates over the files line by line, grabbing the lines for each station from each file in turn and appending them to a list for further processing.
The heart of this code is a generator file_buff that yields the lines of a file but which allows us to push a line back for later reading. When we read a line for the next station we can send it back to file_buff so that we can re-read it when it's time to process the lines for that station.
To test this code, I created some simple fake station data using create_data.
from random import seed, randrange
seed(123)
station_hi = 5
def create_data():
''' Fill 3 files with fake station data '''
fbase = 'datafile_'
for fnum in range(1, 4):
with open(fbase + str(fnum), 'w') as f:
for snum in range(station_hi):
for i in range(randrange(1, 4)):
s = '{1} data{0}{1}{2}'.format(fnum, snum, i)
print(s)
f.write(s + '\n')
print()
create_data()
# A file buffer that you can push lines back to
def file_buff(fh):
prev = None
while True:
while prev:
yield prev
prev = yield prev
prev = yield next(fh)
# An infinite counter that yields numbers converted to strings
def str_count(start=0):
n = start
while True:
yield str(n)
n += 1
# Extract station data from all 3 files
with open('datafile_1') as f1, open('datafile_2') as f2, open('datafile_3') as f3:
fb1, fb2, fb3 = file_buff(f1), file_buff(f2), file_buff(f3)
for snum_str in str_count():
station_lines = []
for fb in (fb1, fb2, fb3):
for line in fb:
#Extract station number string & station data
sid, sdata = line.split()
if sid != snum_str:
# This line contains data for the next station,
# so push it back to the buffer
rc = fb.send(line)
# and go to the next file
break
# Otherwise, append this data
station_lines.append(sdata)
#Process all the data lines for this station
if not station_lines:
#There's no more data to process
break
print('Station', snum_str)
print(station_lines)
output
0 data100
1 data110
1 data111
2 data120
3 data130
3 data131
4 data140
4 data141
0 data200
1 data210
2 data220
2 data221
3 data230
3 data231
3 data232
4 data240
4 data241
4 data242
0 data300
0 data301
1 data310
1 data311
2 data320
3 data330
4 data340
Station 0
['data100', 'data200', 'data300', 'data301']
Station 1
['data110', 'data111', 'data210', 'data310', 'data311']
Station 2
['data120', 'data220', 'data221', 'data320']
Station 3
['data130', 'data131', 'data230', 'data231', 'data232', 'data330']
Station 4
['data140', 'data141', 'data240', 'data241', 'data242', 'data340']
This code can cope if station data is missing for a particular station from one or two of the files, but not if it's missing from all three files, since it breaks the main processing loop when the station_lines list is empty, but that shouldn't be a problem for your data.
For details on generators and the generator.send method, please see 6.2.9. Yield expressions in the docs.
This code was developed using Python 3, but it will also run on Python 2.6+ (you just need to include from __future__ import print_function at the top of the script).
If there may be station ids missing from all 3 files we can easily handle that. Just use a simple range loop instead of the infinite str_count generator.
from random import seed, randrange
seed(123)
station_hi = 7
def create_data():
''' Fill 3 files with fake station data '''
fbase = 'datafile_'
for fnum in range(1, 4):
with open(fbase + str(fnum), 'w') as f:
for snum in range(station_hi):
for i in range(randrange(0, 2)):
s = '{1} data{0}{1}{2}'.format(fnum, snum, i)
print(s)
f.write(s + '\n')
print()
create_data()
# A file buffer that you can push lines back to
def file_buff(fh):
prev = None
while True:
while prev:
yield prev
prev = yield prev
prev = yield next(fh)
station_start = 0
station_stop = station_hi
# Extract station data from all 3 files
with open('datafile_1') as f1, open('datafile_2') as f2, open('datafile_3') as f3:
fb1, fb2, fb3 = file_buff(f1), file_buff(f2), file_buff(f3)
for i in range(station_start, station_stop):
snum_str = str(i)
station_lines = []
for fb in (fb1, fb2, fb3):
for line in fb:
#Extract station number string & station data
sid, sdata = line.split()
if sid != snum_str:
# This line contains data for the next station,
# so push it back to the buffer
rc = fb.send(line)
# and go to the next file
break
# Otherwise, append this data
station_lines.append(sdata)
if not station_lines:
continue
print('Station', snum_str)
print(station_lines)
output
1 data110
3 data130
4 data140
0 data200
1 data210
2 data220
6 data260
0 data300
4 data340
6 data360
Station 0
['data200', 'data300']
Station 1
['data110', 'data210']
Station 2
['data220']
Station 3
['data130']
Station 4
['data140', 'data340']
Station 6
['data260', 'data360']
I would suggest to use pandas.read_csv. You can specify the rows to skip using skiprows and also use a reasonable number of rows to load depending on your filesize using nrows
Here is a link to the documentation:
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
I posted the code below before #PM-2Ring posted his solution.
I would like to leave both solutions active:
The #1 solution that depends on Pandas library (by #DerFaizio). :
This solution finished 5450 stations in 120 minutes (about 45 stations/minute)
import pandas as pd
skips =[1, 1, 1] # to skip the header row forever
for station_number in range(100798):
storage = {}
tmax = pd.read_csv(full_paths[0], skiprows=skips[0], header=None, nrows=126000, usecols=[0, 1, 3])
tmin = pd.read_csv(full_paths[1], skiprows=skips[1], header=None, nrows=126000, usecols=[0, 1, 3])
tavg = pd.read_csv(full_paths[2], skiprows=skips[2], header=None, nrows=126000, usecols=[0, 1, 3])
# tmax is at position 0
for idx, station in enumerate(tmax[0]):
if station == station_number:
date_val = tmax[1][idx]
t_val = float(tmax[3][idx])/10.
storage[date_val] = [t_val, None, None]
skips[0] += 1
else:
break
# tmin is at position 1
for idx, station in enumerate(tmin[0]):
# station, date_val, _, val = lne.split(",")
if station == station_number:
date_val = tmin[1][idx]
t_val = float(tmin[3][idx]) / 10.
if date_val in storage:
storage[date_val][1] = t_val
else:
storage[date_val] = [None, t_val, None]
skips[1] += 1
else:
break
# tavg is at position 2
for idx, station in enumerate(tavg[0]):
...
# similar to Tmin
...
pass
station_info = []
for key in storage.keys():
# do some analysis
# Fill the list station_info
pass
data_out.writerows(station_info)
The following solution is the Generator based solution (by #PM-2Ring)
This solution finished 30000 stations in 12 minutes (about 2500 stations/minute)
files = ['Tmax', 'Tmin', 'Tavg']
headers = ['Nesr_Id', 'r_Year', 'r_Month', 'r_Day', 'Tmax', 'Tmin', 'Tavg']
# A file buffer that you can push lines back to
def file_buff(fh):
prev = None
while True:
while prev:
yield prev
prev = yield prev
prev = yield next(fh)
# An infinite counter that yields numbers converted to strings
def str_count(start=0):
n = start
while True:
yield str(n)
n += 1
# NULL = -999.99
print "Time started: {}".format(time.strftime('%Y-%m-%d %H:%M:%S'))
with open('Results\\GHCN_Daily\\Important\\Temp_All_out_gen.csv', 'wb+') as out_file:
data_out = csv.writer(out_file, quoting=csv.QUOTE_NONE, quotechar='', delimiter=',', escapechar='\\',
lineterminator='\n')
data_out.writerow(headers)
full_paths = [os.path.join(source, '{}.csv'.format(file_name)) for file_name in files]
# Extract station data from all 3 files
with open(full_paths[0]) as f1, open(full_paths[1]) as f2, open(full_paths[0]) as f3:
fb1, fb2, fb3 = file_buff(f1), file_buff(f2), file_buff(f3)
for snum_str in str_count():
# station_lines = []
storage ={}
count = [0, 0, 0]
for file_id, fb in enumerate((fb1, fb2, fb3)):
for line in fb:
if not isinstance(get__proper_data_type(line.split(",")[0]), str):
# Extract station number string & station data
sid, date_val, _dummy, sdata = line.split(",")
if sid != snum_str:
# This line contains data for the next station,
# so push it back to the buffer
rc = fb.send(line)
# and go to the next file
break
# Otherwise, append this data
sdata = float(sdata) / 10.
count[file_id] += 1
if date_val in storage:
storage[date_val][file_id] = sdata
else:
storage[date_val]= [sdata, None, None]
# station_lines.append(sdata)
# # Process all the data lines for this station
# if not station_lines:
# # There's no more data to process
# break
print "St# {:6d}/100797. Time: {}. Tx({}), Tn({}), Ta({}) ".\
format(int(snum_str), time.strftime('%H:%M:%S'), count[0], count[1], count[2])
# print(station_lines)
station_info = []
for key in storage.keys():
# key_val = storage[key]
tx, tn, ta = storage[key]
if ta is None:
if tx != None and tn != None:
ta = round((tx + tn) / 2., 1)
if tx is None:
if tn != None and ta != None:
tx = round(2. * ta - tn, 1)
if tn is None:
if tx != None and ta != None:
tn = round(2. * ta - tx, 1)
# print key,
py_date = from_excel_ordinal(int(key))
# print py_date
station_info.append([snum_str, py_date.year, py_date.month, py_date.day, tx, tn, ta])
data_out.writerows(station_info)
del station_info
Thanks for all.
Going with the built-in csv module, you could do something like:
with open(csvfile, 'r') as f:
reader = csv.reader(f, delimiter=',')
for i in range(n):
reader.next()
for row in reader:
print row # Or whatever you want to do here
Where n is the number of lines you want to skip.