I am trying to open a csv file with csv.DictReader, read in just the first 5 rows of data, perform the primary process of my script, then read in the next 5 rows and do the same for them. Rinse and repeat.
I believe I have a method that works, however I am having issues with the last lines of the data not processing. I know I need to modify my if statement so that it also checks for if I'm at the end of the file, but am having trouble finding a way to do that. I've found methods online, but they involve reading in the whole file to get a row count but doing so would defeat the purpose of this script as I'm dealing with memory issues.
Here is what I have so far:
import csv

count = 0
data = []
with open('test.csv') as file:
    reader = csv.DictReader(file)
    for row in reader:
        count += 1
        data.append(row)
        # Open question: this condition should also become true at the end
        # of the file, so that a final batch of fewer than 5 rows is handled.
        if count % 5 == 0:  # or: something to check for the end of the file
            # do stuff
            data = []
Thank you for the help!
You can use the chunksize argument when reading in the csv. pandas then reads the file lazily and yields one DataFrame of at most that many rows at a time:
import pandas as pd

# With chunksize set, read_csv returns an iterator of DataFrames rather than
# one DataFrame holding the whole file.
reader = pd.read_csv('test.csv', chunksize=5)
for df in reader:
    # do stuff
    pass
You can handle the remaining lines after the for loop body. You can also use the more pythonic enumerate.
import csv

data = []
# newline='' lets the csv module do its own newline handling.
with open('test.csv', newline='') as file:
    reader = csv.DictReader(file)
    # Start counting at 1 so that `count % 5 == 0` fires on every 5th row.
    for count, row in enumerate(reader, 1):
        data.append(row)
        if count % 5 == 0:
            # do stuff
            data = []

# Whatever is still in `data` here is the final, shorter batch.
print('handling remaining lines at end of file')
print(data)
considering the file
a,b
1,1
2,2
3,3
4,4
5,5
6,6
7,7
outputs
handling remaining lines at end of file
[OrderedDict([('a', '6'), ('b', '6')]), OrderedDict([('a', '7'), ('b', '7')])]
This is one approach using the iterator
Ex:
import csv
from itertools import islice

with open('test.csv', newline='') as file:
    reader = csv.DictReader(file)
    while True:
        # Pull up to 5 rows; a shorter list only appears at the end of the
        # file, and an empty one means there is nothing left to process.
        data = list(islice(reader, 5))
        if not data:
            break
        print(data)  # List of up to 5 elements
Staying along the lines of what you wrote and not including any other imports:
import csv

data = []
with open('test.csv', newline='') as file:
    reader = csv.DictReader(file)
    for row in reader:
        data.append(row)
        # Keep only the 5 most recent rows (a sliding window, not a batch).
        if len(data) > 5:
            del data[0]
        if len(data) == 5:
            # Do something with the 5 elements
            print(data)
The if statements let the list fill up with 5 elements before processing begins.
class ZeroItterNumberException(Exception):
    """Raised when a window size smaller than one row is requested."""


class ItterN:
    """Iterate over a sliding window of the last ``n`` items of an iterable.

    Each step yields a list of ``n`` consecutive items; the window advances by
    one item per step. Nothing is yielded if the source has fewer than ``n``
    items.
    """

    def __init__(self, itterator, n):
        """Wrap *itterator* (any iterable) with a window of size *n*.

        Raises:
            ZeroItterNumberException: if *n* is smaller than 1.
        """
        if n < 1:
            raise ZeroItterNumberException(
                "{} is not a valid number of rows.".format(n))
        # iter() lets plain iterables (lists, csv readers, ...) be passed too.
        self.itterator = iter(itterator)
        self.n = n
        self.cache = []

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next window, or raise StopIteration when exhausted."""
        self.cache.append(next(self.itterator))
        # Fill the first window with a loop rather than recursion, so a large
        # n cannot hit the recursion limit.
        while len(self.cache) < self.n:
            self.cache.append(next(self.itterator))
        if len(self.cache) > self.n:
            del self.cache[0]
        # Compare against self.n (not a fixed 5) and hand out a copy so that
        # previously returned windows are not mutated by later steps.
        return list(self.cache)
Related
I started out with a 4d list, something like
# Sample input: a list nested four levels deep whose innermost lists each hold one string.
tokens = [[[["a"], ["b"], ["c"]], [["d"]]], [[["e"], ["f"], ["g"]],[["h"], ["i"], ["j"], ["k"], ["l"]]]]
So I converted this to a csv file using the code
import csv
def export_to_csv(tokens):
    """Write the nested list *tokens* to ``TEST.csv`` as index/value rows.

    Each data row holds the three enclosing indices followed by the element
    found at that position; the first row is the header ``A,B,C,word``.
    """
    # "word" must be a string literal; a bare name raises NameError.
    csv_list = [["A", "B", "C", "word"]]
    for h_index, h in enumerate(tokens):
        for i_index, i in enumerate(h):
            for j_index, j in enumerate(i):
                csv_list.append([h_index, i_index, j_index, j])
    # newline='' stops the csv module's own line endings from being
    # translated again (which produces blank lines on Windows).
    with open('TEST.csv', 'w', newline='') as f:
        # using csv.writer method from CSV package
        write = csv.writer(f)
        write.writerows(csv_list)
But now I want to do the reverse process, want to convert a csv file obtained in this format, back to the list format mentioned above.
Assuming you wanted your csv file to look something like this (there were a couple typos in the posted code):
A,B,C,word
0,0,0,a
0,0,1,b
0,0,2,c
...
here's one solution:
import csv
def import_from_csv(filename):
    """Rebuild the nested list from a CSV of ``x,y,z,word`` rows.

    The first row is treated as a header and discarded. Each word is stored
    as a one-element list at ``result[x][y][z]``. Positions skipped by the
    indices keep their placeholder value (``[[]]``, ``[]`` or ``0`` depending
    on the level).
    """
    retval = []
    with open(filename, newline='') as fh:
        reader = csv.reader(fh)
        # discard header row (tolerating a completely empty file)
        next(reader, None)
        # process data rows
        for (x, y, z, word) in reader:
            x = int(x)
            y = int(y)
            z = int(z)
            # Grow each level with freshly created lists. Multiplying a
            # nested list (``[[[]]] * k``) would repeat references to the
            # same inner list, so filling one slot would fill its siblings.
            while len(retval) <= x:
                retval.append([[]])
            while len(retval[x]) <= y:
                retval[x].append([])
            while len(retval[x][y]) <= z:
                retval[x][y].append(0)
            retval[x][y][z] = [word]
    return retval
def import_from_csv(file):
    """Rebuild the nested list from a CSV whose 4th column is a list literal.

    Rows are ``a,b,c,value`` where ``value`` is the text of a Python literal
    (for example ``['a']``); it is parsed and stored at ``data[a][b][c]``.
    The first row is treated as a header and skipped.
    """
    import ast
    import csv

    data = []
    # Read the CSV file
    with open(file, newline='') as fp:
        reader = csv.reader(fp)
        # Skip the first line, which contains the headers
        next(reader, None)
        for line in reader:
            # Read the first 3 elements of the line
            a, b, c = (int(i) for i in line[:3])
            # When we read it back, everything comes in as strings. Use
            # `literal_eval` to convert it to a Python list
            value = ast.literal_eval(line[3])
            # Extend each level until the target index exists; looping (not a
            # single append) keeps this working when an index is skipped.
            while len(data) <= a:
                data.append([[[]]])
            while len(data[a]) <= b:
                data[a].append([[]])
            while len(data[a][b]) <= c:
                data[a][b].append([])
            data[a][b][c] = value
    return data
# Test: the structure read back from TEST.csv should equal the original `tokens` list
assert import_from_csv("TEST.csv") == tokens
First, I'd make writing this construction in a CSV format independent from dimensions:
import csv
def deep_iter(seq):
    """Yield ``(index, ..., index, value)`` tuples for every leaf of *seq*.

    Lists are descended into recursively; anything else is treated as a leaf.
    Each yielded tuple holds the index at every nesting level followed by the
    leaf value. Empty lists contribute no rows.
    """
    for i, val in enumerate(seq):
        if isinstance(val, list):
            for others in deep_iter(val):
                # Prefix the indices found deeper down with this level's one.
                yield (i, *others)
        else:
            yield i, val
# newline='' leaves line endings to the csv writer (avoids blank lines on Windows).
with open('TEST.csv', 'w', newline='') as f:
    csv.writer(f).writerows(deep_iter(tokens))
Next, we can use the lexicographic order of the indices to recreate the structure. All we have to do is sequentially move deeper into the output list according to the indices of a word. We stop at the penultimate index to get the last list, because the last index is pointing only at the place of the word in this list and doesn't matter due to the natural ordering:
with open('TEST.csv', 'r', newline='') as f:
    rows = [*csv.reader(f)]

res = []
for r in rows:
    if not r:
        continue  # ignore blank lines instead of failing on r[-1]
    e = res
    # Walk down through every index except the last one; the last index only
    # gives the word's position inside its list, which the row order already
    # guarantees.
    for part in r[:-2]:
        i = int(part)
        if i < len(e):
            e = e[i]
        else:
            e.append([])  # add new record at this level
            e = e[-1]
    e.append(r[-1])  # append the word to the corresponding list
I am working on this function for my course. It is a requirement that I create a function that counts the number of cameras in each location. I am a novice, I don't understand why the loop doesn't continue through the entire CSV file. See output.
import csv
def nr_cameras(location_data):
    """Count the rows per 'Kommun' in a semicolon-separated CSV file.

    As posted in the question, this returns a string for the first
    municipality only (or None when the file has no data rows).
    """
    qty = 0
    cameras = {}
    with open(location_data, 'r') as p:
        csv_reader = csv.DictReader(p, delimiter=';')
        for row in csv_reader:
            cameras[row['Kommun']] = cameras.get(row['Kommun'], 0) + 1
    for k in cameras.keys():
        # NOTE: `return` leaves the function during the first iteration, so
        # only one municipality is ever reported - the behaviour asked about.
        return (f"{k} {cameras [k]}")
location_data = nr_cameras('locationData.csv')
print('Kommun Nr of cameras')
print('--------------------------')
print(location_data)
Current output:
Kommun Nr of cameras
--------------------------
Vänersborg 3
Goal output:
Kommun Nr of cameras
---------------------------
Alingsås 17
Bengtsfors 4
Essunga 10
Falköping 28
...
Vara 7
Vänersborg 20
Vårgårda 13
---------------------------
There is a total of 297 cameras.
The csv file looks something like this:
MätplatsID;Namn;Vägnummer;Kommun
14002010;Bhpl Gestadvägen;E45;Vänersborg
14002080;Bhpl Asmundebyn;E45;Vänersborg
14002120;Östebyn Söder;E45;Vänersborg
14005010;VVIS station;161;Uddevalla
14005020;Nybygget V;161;Uddevalla
14006010;Bälinge nö;1900;Alingsås
14006030;Torp;1900;Vårgårda
14006040;Hols skola;1900;Vårgårda
Simply change return to print inside the loop and then return the dictionary at the end. In Python, a function stops executing as soon as it reaches a return statement, so a return inside a loop ends the loop on its first iteration.
def nr_cameras(location_data):
    """Print and return the number of cameras per 'Kommun'.

    Reads the semicolon-separated file at *location_data*, prints one
    "<kommun> <count>" line per municipality and returns the counts as a dict.
    """
    cameras = {}
    with open(location_data, 'r', newline='') as p:
        csv_reader = csv.DictReader(p, delimiter=';')
        for row in csv_reader:
            cameras[row['Kommun']] = cameras.get(row['Kommun'], 0) + 1
    for k, v in cameras.items():
        print(f"{k} {v}")
    # Return once, after the loop, so every municipality is reported.
    return cameras
# Print the table header first; nr_cameras prints the rows itself.
print('Kommun Nr of cameras')
print('--------------------------')
location_data = nr_cameras('locationData.csv')
Alternatively, handle all printing outside the method:
def nr_cameras(location_data):
    """Return a dict mapping each 'Kommun' to its number of cameras.

    *location_data* is the path of a semicolon-separated CSV file with a
    'Kommun' column. Nothing is printed; the caller formats the result.
    """
    from collections import Counter

    with open(location_data, 'r', newline='') as p:
        csv_reader = csv.DictReader(p, delimiter=';')
        # Counter does the "get current count, add one" bookkeeping; convert
        # back to a plain dict so the return type is unchanged.
        return dict(Counter(row['Kommun'] for row in csv_reader))
location_data = nr_cameras('locationData.csv')
print('Kommun Nr of cameras')
print('--------------------------')
# One output line per municipality and its camera count.
for k, v in location_data.items():
    print(f"{k} {v}")
In your loop over the camera entries, you are returning the first entry straight away. You probably want to append to a string in this loop, and return that afterwards.
I have this function:
import csv

myfile = r'csvlist.csv'
with open(myfile, 'r', newline='') as f:
    c = csv.reader(f, delimiter=',')
    i = next(c).index('Wasted Years')
    filtering = [row for row in c if row[i] == '25']
    # NOTE: the comprehension above has already consumed every row of `c`,
    # so this generator sees no rows and the sum is 0 - the behaviour asked
    # about.
    total = sum(float(row["Prices"]) for row in c)
    print(filtering, "The total is %s" % total)
The filtering part works well, but the total one, it is supposed to iterate over some items on the column, and output a total from these items, but it prints 0 for some reason, any ideas?
i iterates over a int column, which has data on each cell, like: 25, 18, 30, etc, and filters by a specific number, in this case '25'
total it's supposed to sum everything on Prices column and output a total, these are float-like records
First Solution
c is a csv.reader object, it is also an iterable: you can iterate through it only once. The first time you iterate through c is when you calculate filtering. After that, c becomes empty. One way to deal with this is to create two iterables: c1 and c2 that are identical:
import itertools
import csv

with open(myfile, 'r', newline='') as f:
    c = csv.reader(f, delimiter=',')
    # Split into 2 separate iterables. tee() buffers the rows one side has
    # read until the other side reads them too.
    c1, c2 = itertools.tee(c)
    # Use the first iterable, c1
    header = next(c1)
    i = header.index('Wasted Years')
    filtering = [row for row in c1 if row[i] == '25']
    # Use a different iterable, c2
    next(c2)  # Skip the header row
    # csv.reader rows are lists, so look the column up by position
    # (row["Prices"] would raise TypeError).
    price_col = header.index('Prices')
    total = sum(float(row[price_col]) for row in c2)
    print(filtering, "The total is %s" % total)
Second Solution
Another solution is to rewind the file pointer to the beginning before iterating through c the second time:
import csv

myfile = r'csvlist.csv'
with open(myfile, 'r', newline='') as f:
    c = csv.reader(f, delimiter=',')
    header = next(c)
    i = header.index('Wasted Years')
    # csv.reader rows are lists, so the Prices column is addressed by
    # position (row["Prices"] would raise TypeError).
    price_col = header.index('Prices')
    filtering = [row for row in c if row[i] == '25']
    f.seek(0)  # Rewind the file to the beginning
    next(c)  # Skip the header row
    total = sum(float(row[price_col]) for row in c)
    print(filtering, "The total is %s" % total)
Third Solution
I found out what you and I did wrong: The first time calculating the filtering, we use csv.reader, but the second time when calculating the total, we treated the reader as if it was a csv.DictReader. Let's use csv.DictReader all the way through:
import csv

myfile = r'csvlist.csv'
with open(myfile, 'r', newline='') as f:
    # DictReader takes the column names from the first row, so columns can
    # be addressed by name.
    c = csv.DictReader(f)
    filtering = [row for row in c if row['Wasted Years'] == '25']
    # Rewind and skip header
    f.seek(0)
    next(f)
    total = sum(float(row["Prices"]) for row in c)
    print(filtering, "The total is %s" % total)
I have a feeling that you want to calculate the total from the filtered rows only, not from every row of the csv. If that is the case:
total = sum(float(row["Prices"]) for row in filtering) # filtering, not c
I have to access the Nth line in a CSV file.
Here's what I did:
import csv

the_file = open('path', 'r')
reader = csv.reader(the_file)
# input() returns a string; convert it so that `i == N` can ever be true.
N = int(input('What line do you need? > '))
i = 0
for row in reader:
    if i == N:
        print("This is the line.")
        print(row)
        break
    i += 1
the_file.close()
...but this does not feel optimal. Edit for precision: If the file is huge, I do not want to go through all the lines and I do not want to have to load the whole file into memory.
I do hope something like reader[N] exists, but I have not found it.
Edit for answer: This line (coming from chosen answer) is what I was looking for:
next(itertools.islice(csv.reader(f), N, None))
You can use enumerate to iterate through the list until you find the right row:
# enumerate supplies the 0-based row index, replacing a manual counter.
for i, row in enumerate(reader):
    if i == line_number:
        print("This is the line.")
        print(row)
        break
You can also use itertools.islice which is designed for this type of scenario - accessing a particular slice of an iterable without reading the whole thing into memory. It should be a bit more efficient than looping through the unwanted rows.
def get_csv_line(path, line_number):
    """Return row number *line_number* (0-based) of the CSV file at *path*.

    Rows before the requested one are read and discarded, so the file is not
    held in memory as a whole. StopIteration is raised when the file has
    fewer rows than requested.
    """
    # newline='' leaves newline handling to the csv module, so line breaks
    # inside quoted fields are preserved exactly.
    with open(path, newline='') as f:
        return next(itertools.islice(csv.reader(f), line_number, None))
But if your CSV file is small, just read the entire thing into a list, which you can then access with an index in the normal way. This also has the advantage that you can access several different rows in random order without having to reset the csv reader.
with open(path, newline='') as f:
    # Read every row up front; afterwards rows are available by index.
    my_csv_data = list(csv.reader(f))
print(my_csv_data[line_number])
Your solution is actually not that bad. Advancing the file iterator to the line you want is a good approach and is used in many situations like this.
If you want it more concise though, you can use next and enumerate with a generator expression:
import csv
# The file is opened manually and closed explicitly on the last line.
the_file = open('path', 'r')
reader = csv.reader(the_file)
# Convert the typed text to an int so it can be compared with the row index.
N = int(input('What line do you need? > '))
# Take the first row whose index equals N; None is the fallback when no row matches.
line = next((x for i, x in enumerate(reader) if i == N), None)
print(line)
the_file.close()
The None in there is what will be returned if the line is not found (N is too large). You can pick any other value though.
You could also open the file with a with-statement to have it be automatically closed:
import csv

# The with-statement closes the file even if an error occurs inside it.
with open('path', 'r') as the_file:
    reader = csv.reader(the_file)
    N = int(input('What line do you need? > '))
    # First row whose index equals N, or None when there is no such row.
    line = next((x for i, x in enumerate(reader) if i == N), None)
    print(line)
If you really want to cut down on size, you could do:
from csv import reader

N = int(input('What line do you need? > '))
with open('path') as f:
    # Prints the N-th row (0-based), or None when the file is too short.
    print(next((x for i, x in enumerate(reader(f)) if i == N), None))
The itertools module has a number of functions for creating specialized iterators — and its islice() function could be used to easily solve this problem:
import csv
import itertools

N = 5  # desired line number
with open('path.csv', newline='') as the_file:
    # Slice the raw file lines first and parse only the selected one. This
    # counts physical lines, so it assumes no quoted field spans several lines.
    row = next(csv.reader(itertools.islice(the_file, N, N+1)))

print("This is the line.")
print(row)
P.S. For the curious, my initial response — which also works (arguably better) — was:
row = next(itertools.islice(csv.reader(the_file), N, N+1))
You can simply do:
n = 2  # line to print
# The with-statement closes the file even if reading fails.
with open('foo.csv', 'r') as fd:
    lines = fd.readlines()
# print() as a function works on Python 3 (and on Python 2 for one argument).
print(lines[n-1])  # prints 2nd line
Or even better to utilize less memory by not loading entire file into memory:
import linecache
n = 2
# The returned line is neither stored nor printed here; wrap the call in print() to display it.
linecache.getline('foo.csv', n)
You could minimize your for loop into a comprehension expression, e.g.
# Collect the rows whose index equals N and take the first element of that list.
row = [row for i,row in enumerate(reader) if i == N][0]
# or even nicer as seen in iCodez code with next and generator expression
row = next(row for i,row in enumerate(reader) if i == N)
import csv

with open('cvs_file.csv', 'r', newline='') as inFile:
    reader = csv.reader(inFile)
    my_content = list(reader)

# input() returns text; convert it before using it as an index.
line_no = int(input('What line do you need(line number begins from 0)? > '))
# Reject negative numbers too, which would otherwise index from the end.
if 0 <= line_no < len(my_content):
    print(my_content[line_no])
else:
    print('This line does not exists')
As a result now you can get any line by its index directly:
What line do you need? > 2
['101', '0.19', '1']
What line do you need? > 100
This line does not exists
I am working on a data analysis using a CSV file that I got from a datawarehouse(Cognos). The CSV file has the last row that sums up all the rows above, but I do not need this line for my analysis, so I would like to skip the last row.
I was thinking about adding "if" statement that checks a column name within my "for" loop like below.
import csv

# Open 2 CSV files. One to read and the other to save. On Python 3 the csv
# module needs text-mode files opened with newline=''.
with open('COGNOS.csv', newline='') as f, \
        open('New_COGNOS.csv', 'w', newline='') as w:
    CSV_raw = csv.reader(f)
    CSV_new = csv.writer(w)
    for row in CSV_raw:
        # Stop at the summary row before touching its other columns.
        if row[0] == "All Materials (By Collection)":
            break
        item_num = row[3].split(" ")[0]
        row.append(item_num)
        CSV_new.writerow(row)
However, this looks like wasting a lot of resource. Is there any pythonian way to skip the last row when iterating through CSV file?
You can write a generator that'll return everything but the last entry in an input iterator:
def skip_last(iterator):
    """Yield every item of *iterator* except the last one.

    Accepts any iterable. Yields nothing for empty or single-item input.
    """
    it = iter(iterator)
    try:
        prev = next(it)
    except StopIteration:
        # Empty input: finish cleanly. A StopIteration escaping from a
        # generator body is turned into RuntimeError (PEP 479).
        return
    for item in it:
        # Always stay one item behind, so the final item is never yielded.
        yield prev
        prev = item
then wrap your CSV_raw reader object in that:
for row in skip_last(CSV_raw):
The generator basically takes the first entry, then starts looping and on each iteration yield the previous entry. When the input iterator is done, there is still one line left, that is never returned.
A generic version, letting you skip the last n elements, would be:
from collections import deque
from itertools import islice
def skip_last_n(iterator, n=1):
    """Yield the items of *iterator* except the last *n*.

    Only *n* items are buffered at any time. ``n=0`` yields everything.

    Raises:
        ValueError: if *n* is negative (raised when iteration starts).
    """
    if n < 0:
        raise ValueError("n must be non-negative")
    it = iter(iterator)
    if n == 0:
        # An empty buffer has nothing to pop, so pass the items straight through.
        yield from it
        return
    # Pre-load the first n items; from here on, each new item pushes out the
    # oldest buffered one, which is then safe to yield.
    prev = deque(islice(it, n), n)
    for item in it:
        yield prev.popleft()
        prev.append(item)
A generalized "skip-n" generator
from __future__ import print_function
try:
    from StringIO import StringIO  # Python 2
except ImportError:
    from io import StringIO  # Python 3: the StringIO module was removed
from itertools import tee
s = '''\
1
2
3
4
5
6
7
8
'''
def skip_last_n(iterator, n=1):
    """Yield the items of *iterator* except the last *n*, using ``tee``.

    One copy of the iterator is advanced *n* items ahead; the other copy is
    yielded from for as long as the leading copy still has items.
    """
    a, b = tee(iterator)
    for _ in range(n):
        try:
            next(a)
        except StopIteration:
            # Fewer than n items in total: nothing to yield. Returning here
            # keeps StopIteration from escaping the generator (PEP 479).
            return
    for _ in a:
        yield next(b)
i = StringIO(s)
# Each line already ends with a newline, hence end=''.
for x in skip_last_n(i, 1):
    print(x, end='')
1
2
3
4
5
6
7
i = StringIO(s)
# Each line already ends with a newline, hence end=''.
for x in skip_last_n(i, 3):
    print(x, end='')
1
2
3
4
5