Separating Values from a file in Python

Separating Values from a file in Python - python

So let's say I have this txt file formatted (value)(space)(value) and there's a second set of numbers separated with a (tab). An example file is given here:
Header
5 5 6 7 8 7 8 9 0 1
7 6 3 4 1 1 3 6 8 1
8 7 4 1 3 1 9 8 5 1
Now I'm using this code to print all the values shown in the txt file:
NEWLINE = "\n"


def readBoardFromFile():
    """Prompt for a file name until one opens, then read it into a board.

    The first line of the file is treated as a header and skipped.  Every
    remaining line becomes one row holding each character of that line
    (digits, spaces and tabs alike) except the newline.

    Returns:
        A tuple ``(aBoard, numRows, numColumns)``; ``numColumns`` is the
        length of the first row, or 0 when the file has no data rows.
    """
    while True:
        inputFileName = input("Enter the name of your file: ")
        try:
            # The with-block closes the file even if reading fails part-way.
            with open(inputFileName, "r") as inputFile:
                print("Opening File " + inputFileName + " for reading")
                next(inputFile, None)  # skip the header; tolerate an empty file
                # Rebuilt on every attempt so a failed read leaves no
                # half-filled rows behind.
                aBoard = [[ch for ch in line if ch != NEWLINE]
                          for line in inputFile]
        except IOError:
            print("Error: File couldn't be opened")
            continue
        print("Completed reading of file " + inputFileName)
        break
    numRows = len(aBoard)
    numColumns = len(aBoard[0]) if aBoard else 0
    return (aBoard, numRows, numColumns)


def display(aBoard, numRows, numColumns):
    """Print the first numColumns cells of each row, then a row of asterisks.

    Cells are printed back to back with no separator; the footer holds one
    asterisk per column and is followed by an empty line.
    """
    print("DISPLAY")
    for currentRow in range(numRows):
        for currentColumn in range(numColumns):
            print("%s" % (aBoard[currentRow][currentColumn],), end="")
        print()
    print("*" * numColumns, end="")
    print(NEWLINE)


def start():
    """Read a board from a user-chosen file and display it."""
    aBoard, numRows, numColumns = readBoardFromFile()
    display(aBoard, numRows, numColumns)


if __name__ == "__main__":
    # Guarded so importing this module does not block on input().
    start()
Normally when I run this code this is the output:
DISPLAY
5 5 6 7 8 7 8 9 0 1
7 6 3 4 1 1 3 6 8 1
8 7 4 1 3 1 9 8 5 1
*******************
How do I make it so that the output is:
DISPLAY
5 5 6 7 8
7 6 3 4 1
8 7 4 1 3
Only displaying the numbers in the left half?

Perhaps you can try using the csv module and open the file with tab delimiter.
Then assuming you go with the list approach you could only print the first element from each row.
Something like:
import csv
# Print the first tab-separated field of every data row in a.txt.
# newline="" is what the csv module expects so it can handle line endings
# itself.
with open("a.txt", newline="") as my_file:
    reader = csv.reader(my_file, delimiter="\t")
    next(reader, None)  # skip the header row; the default avoids StopIteration
    for line in reader:
        if line:  # blank lines come back as empty lists
            print(line[0])

From what I can see you aren't taking the tab character into account in your code which is probably why you have the additional characters in your output.
Here's the approach I would take, utilising Python's power when it comes to processing strings.
I would encourage you to use this type of approach when writing Python, as it will make your life much easier.
NEWLINE = "\n"


def read_board_from_file():
    """Prompt for a file name until one opens; return the left-hand board.

    Lines containing the word "header" (case-insensitive) are skipped.  For
    every other line only the text before the first tab is kept, and that
    text is split on whitespace into a list of entries.

    Returns:
        A list of rows, each row being a list of strings.
    """
    while True:
        input_file_name = input("Enter the name of your file: ")
        a_board = []
        try:
            # A file object can simply be iterated line by line; the
            # with-block closes it for us.
            with open(input_file_name, "r") as input_file:
                for line in input_file:
                    if "header" in line.lower():
                        continue
                    # Drop the newline, then keep only the part left of the
                    # first tab character.
                    first_part = line.strip(NEWLINE).split("\t")[0]
                    a_board.append(first_part.split())
        except IOError:
            # This must be an f-string, otherwise the braces print literally.
            print(f"Error: File {input_file_name} couldn't be opened.")
            continue
        print(f"Completed reading of file {input_file_name}")
        return a_board


def display_board(a_board):
    """Print each row space-separated, then a footer of asterisks.

    The footer is as wide as the longest printed row and is followed by an
    empty line.
    """
    print("DISPLAY")
    row_strs = [" ".join(row) for row in a_board]
    for row_str in row_strs:
        print(row_str)
    longest_row = max(map(len, row_strs), default=0)
    print("*" * longest_row, end="")
    print(NEWLINE)


def start():
    """Read a board from a user-chosen file and display it."""
    a_board = read_board_from_file()
    display_board(a_board)


if __name__ == "__main__":
    # Guarded so importing this module does not block on input().
    start()

I'd do this with a bit more separation between inputs, outputs, and data processing. Your input is the name of the file, and a secondary input is the actual file contents. The data processing step is taking the file contents, and returning some internal representation of a collection of boards. The output is displaying the first board.
from typing import Iterable, List
def parse(lines: Iterable[str], board_sep: str = "\t") -> List[List[List[str]]]:
    """Split side-by-side boards into one list of rows per board.

    Each input line holds the same row of every board, separated by
    *board_sep*; within a board the entries are whitespace-separated.

    Args:
        lines: The text lines to parse (trailing newlines are fine).
        board_sep: The string separating boards on a line.

    Returns:
        ``boards[b][r]`` is the list of entries in row *r* of board *b*.
        Empty input gives an empty list.
    """
    boards: List[List[List[str]]] = []
    for line in lines:
        # The same row from each board, left to right.
        board_lines = line.rstrip("\r\n").split(board_sep)
        # Create boards on demand, so a line with more sections than the
        # first one no longer raises IndexError.
        while len(boards) < len(board_lines):
            boards.append([])
        for board, board_line in zip(boards, board_lines):
            board.append(board_line.split())
    return boards
def show_board(board: List[List[str]]) -> None:
    """Print each row of *board* on its own line, entries space-separated."""
    for row in board:
        print(" ".join(row))
Now we can put that all together. We need to:
Get the filename
Open the file
Filter out the "Header" and any blank lines
Pass the rest of the lines to the parse() function
Get the first board
Print it with the context
from typing import Tuple
def get_board_dimensions(board: List[List[str]]) -> Tuple[int, int]:
    """Return ``(rows, cols)``; cols is the length of the first row.

    An empty board yields ``(0, 0)`` instead of raising IndexError.
    """
    if not board:
        return 0, 0
    return len(board), len(board[0])
def get_filtered_file(filename: str) -> Iterable[str]:
    """Yield the lines of *filename*, skipping blank and "header" lines.

    Lines are yielded unchanged (newline included).  The comparison is made
    on a stripped copy: lines read from a file keep their trailing newline,
    so comparing the raw line with "header" or testing it for emptiness
    never matches.
    """
    with open(filename) as f:
        for line in f:
            stripped = line.strip()
            if not stripped or stripped.lower() == "header":
                continue
            yield line
def main():
    """Ask for a file name, parse its boards and print the first one.

    Prints the "DISPLAY" banner, the first board, and a footer of asterisks
    sized as one per column plus one per gap between columns.
    """
    filename = input("Enter the name of your file: ")
    boards = parse(get_filtered_file(filename))
    print("DISPLAY")
    if not boards:
        # A file with no data lines produces no boards; boards[0] would
        # raise IndexError, so only the banner is printed.
        return
    # now we can show the first one
    b = boards[0]
    _, cols = get_board_dimensions(b)
    show_board(b)
    print("*" * (2 * cols - 1))  # columns plus gaps

Related

Increment column i element by values inside list

I have a loop that searches keywords in a text file and pastes integers that follow the keywords into an excel file. I want the integers to be in specific cells in the excel file. Is it possible to increment i by the values in i_list rather than always 5 like in the example?
i_list = [5,3,1,1]
def search(file, excel_file):
    """Copy the first integer found for each keyword into row 3 of a sheet.

    For every entry of the module-level ``keywords`` the first line of
    *file* that contains the keyword and at least one digit is used; its
    first run of digits is written to row 3, starting at column 2 and moving
    5 columns right after each write.  The workbook is saved back to
    *excel_file*.

    Args:
        file: An iterable of text lines (an open file works).
        excel_file: Path handed to ``load_workbook`` and ``wb.save``.
    """
    i = 2
    found_keywords = set()
    wb = load_workbook(excel_file)
    sheet1 = wb[wb.sheetnames[0]]
    # Read the lines once: an open file is exhausted after one pass, so
    # looping over it again for the second keyword would see nothing.
    lines = list(file)
    for kwrd in keywords:
        if kwrd in found_keywords:
            continue
        for line in lines:
            if kwrd not in line:
                continue
            match = re.search(r"\d+", line)
            if match is None:
                # Keyword present but no number on this line; keep looking
                # instead of failing on None.group().
                continue
            found_keywords.add(kwrd)
            sheet1.cell(row=3, column=i).value = int(match.group())
            i += 5
            break  # later lines with the same keyword are ignored
    wb.save(excel_file)

If you don't need to cycle through the values of i_list, you can create a generator that hands you the increments one by one. I have wrapped the call that fetches the next increment in a try block, because once the generator runs out of values the code has nothing left to add, so we break out of the loop.
# Demo: add each increment from the generator to i, stopping when the
# generator is exhausted.
i_list = (step for step in [5, 3, 1, 1])
i = 2
for _ in range(10):
    print(i)
    try:
        i += next(i_list)
    except StopIteration:
        print("no more values in i so loop terminating")
        break
OUTPUT
2
7
10
11
12
no more values in i so loop terminating
However, if you want to cycle through the values of i_list, you can use cycle from the itertools module and keep taking the next item from i_list indefinitely, for as long as you need.
from itertools import cycle
# Demo: cycle() repeats the increments forever, so next() never runs dry.
i_list = cycle([5, 3, 1, 1])
i = 2
for _ in range(10):
    print(i)
    i += next(i_list)
OUTPUT
2
7
10
11
12
17
20
21
22
27
UPDATE OF YOUR CODE
below is an update of your code based on the fact you said you dont have to cycle. Remember that once you reach the end of i_list your code will not be able to increase i since there are no more values in i_list.
i_list = [5, 3, 1, 1]


def search(file, excel_file):
    """Write one integer per keyword into row 3, stepping columns by i_list.

    The first value goes to column 2; after each write the column advances
    by the next entry of ``i_list``.  Once ``i_list`` is used up there is no
    further column to move to, so the remaining keywords are left unwritten
    and the workbook is saved.

    Args:
        file: An iterable of text lines (an open file works).
        excel_file: Path handed to ``load_workbook`` and ``wb.save``.
    """
    i = 2
    # A fresh iterator per call: a module-level generator would stay
    # exhausted after the first call to search().
    steps = iter(i_list)
    found_keywords = set()
    wb = load_workbook(excel_file)
    sheet1 = wb[wb.sheetnames[0]]
    # Read the lines once; an open file cannot be looped over twice.
    lines = list(file)
    for kwrd in keywords:
        if i is None:
            break  # ran out of column increments
        if kwrd in found_keywords:
            continue
        for line in lines:
            if kwrd not in line:
                continue
            match = re.search(r"\d+", line)
            if match is None:
                continue
            found_keywords.add(kwrd)
            sheet1.cell(row=3, column=i).value = int(match.group())
            # next() with a default avoids an uncaught StopIteration.
            step = next(steps, None)
            i = None if step is None else i + step
            break
    wb.save(excel_file)

This code snippet, adjusted from the original, cycles through the values of i_list:
i_list = [5, 3, 1, 1]


def search(file, excel_file):
    """Write one integer per keyword into row 3, cycling through i_list.

    The first value goes to column 2; after each write the column advances
    by the next entry of ``i_list``, starting over at its beginning when the
    list is used up.

    Args:
        file: An iterable of text lines (an open file works).
        excel_file: Path handed to ``load_workbook`` and ``wb.save``.
    """
    from itertools import cycle  # local import keeps the snippet self-contained

    i = 2
    steps = cycle(i_list)
    found_keywords = set()
    wb = load_workbook(excel_file)
    sheet1 = wb[wb.sheetnames[0]]
    # Read the lines once; an open file cannot be looped over twice.
    lines = list(file)
    for kwrd in keywords:
        if kwrd in found_keywords:
            continue
        for line in lines:
            if kwrd not in line:
                continue
            match = re.search(r"\d+", line)
            if match is None:
                continue
            found_keywords.add(kwrd)
            sheet1.cell(row=3, column=i).value = int(match.group())
            # Advance the column by the next increment.  Looping
            # "for i in i_list" would instead write the same value into
            # columns 5, 3, 1 and 1.
            i += next(steps)
            break
    wb.save(excel_file)

How can I merge each two lines of a large text file into a Python list?

I have a .txt file that is split into multiple lines, but each two of these lines I would like to merge into a single line of a list. How do I do that?
Thanks a lot!
What I have is organized like this:
[1 2 3 4
5 6]
[1 2 3 4
5 6 ]
while what I need would be:
[1 2 3 4 5 6]
[1 2 3 4 5 6]

# Merge every two consecutive lines of the input file into one output line.
with open(r'<add file path here >', 'r') as file:
    x = file.readlines()
# Slicing keeps a trailing unpaired line as a one-element group.
data = [x[i:i + 2] for i in range(0, len(x), 2)]
# Join each pair with a space, then drop the newlines the lines carried.
new = [' '.join(pair).replace('\n', '') for pair in data]
new_file_name = r''  # give new file path here
with open(new_file_name, 'w+') as file:
    file.writelines(merged + '\n' for merged in new)

Try This
# Join every two consecutive lines of file.txt into one string.
with open('file.txt') as a:
    fdata = a.readlines()
# Slicing (rather than indexing ln + 1) means a file with an odd number of
# lines keeps its last line instead of raising IndexError.
final_data = [
    " ".join(part.strip('\n') for part in fdata[ln:ln + 2])
    for ln in range(0, len(fdata), 2)
]
print(final_data)

I feel you can use a regex for solving this scenario :
#! /usr/bin/env python2.7
import re
with open("textfilename.txt") as r:
    text_data = r.read()
# Search the text that was read, not the file object r.
# re.DOTALL lets "." cross the newline inside each bracketed group.
independent_lists = re.findall(r"\[(.+?)\]", text_data, re.DOTALL)
# Now that we have each bracketed group as a string, turn it into a list
# of its whitespace-separated items.
final_list_of_objects = [each_string.replace("\n", " ").split() for each_string in independent_lists]
# Single-argument print(...) behaves the same on Python 2 and 3.
print(final_list_of_objects)
However if you do not want them to be as a list object and rather just want the outcome without the newline characters inbetween the list then:
#! /usr/bin/env python2.7
import re
with open("textfilename.txt") as r:
    text_data = r.read()
pieces = []
# True while we are between "[" and "]".  Initialised up front so a
# newline before the first "[" cannot hit an unbound name.
bool_char = False
for each_char in text_data:
    if each_char == "[":
        bool_char = True
    elif each_char == "]":
        bool_char = False
    elif each_char == "\n" and bool_char:
        each_char = " "  # a newline inside brackets becomes a space
    pieces.append(each_char)
# Collapse runs of spaces/tabs only.  Using \s+ here would also swallow
# the newlines that separate one bracketed list from the next.
new_txt = re.sub(r"[ \t]+", " ", "".join(pieces))

You can do two things here:
1) If the text file was created by writing using numpy's savetxt function, you can simply use numpy.loadtxt function with appropriate delimiter.
2) Read file in a string and use a combination of replace and split functions.
# Read the whole file; the with-block closes it afterwards.
with open(filename, 'r') as file:
    dataset = file.read()
# Put everything on one line, then start a new line after every "] ".
dataset = dataset.replace('\n', ' ').replace('] ', ']\n').split('\n')
# split() without an argument drops the empty strings that split(' ')
# leaves behind, and blank chunks are skipped so no [''] row appears.
dataset = [x.replace('[', '').replace(']', '').split() for x in dataset if x.strip()]

# Join all stripped lines with spaces, then break after each "] " so every
# bracketed list ends up on its own line.
with open('test.txt') as file:
    joined = " ".join(line.strip() for line in file)
new_data = joined.replace('] ', ']\n').split('\n')  # e.g. ['[1 2 3 4 5 6]', '[1 2 3 4 5 6 ]']
# The result is written back over the same file.
with open('test.txt', 'w+') as file:
    file.writelines(data + '\n' for data in new_data)
line.strip() removes the leading and trailing whitespace, including the trailing newline ('\n'), from the line.
You need to pass all of the read and stripped lines to ' '.join(), not
each line by itself. Strings in Python are sequences too, so the string
contained in line is interpreted as separate characters when passed on
its own to ' '.join().

Possible to do this more efficiently (turn compact file to sparse)

I have to read in a file line by line that has indices of where a vector has 1's
so for example:
1 3 9 10
means:
0,1,0,1,0,0,0,0,0,1,1
My goal is to write program that will take each line and print out the full vector with the 0's.
I am able to do this with my current program for a few lines:
#create a sparse vector
list_line_sparse = [0] * int(num_features)
# Rows are collected in a list and joined once at the end; appending to one
# ever-growing string inside the loop is the slow part.
rows = []
#loop over all the lines
for item in lines:
    #get all ints on a line; split() ignores repeated spaces and the newline
    d = [int(x) for x in item.split()]
    #loop over all ints and change index to 1 in sparse vector
    for i in d:
        list_line_sparse[i] = 1
    rows.append(', '.join(str(value) for value in list_line_sparse))
    #change back to 0's
    for i in d:
        list_line_sparse[i] = 0
out_file += ''.join(row + '\n' for row in rows)
with open('outfile', 'w') as f:
    f.write(out_file)
The problem is that for a file with a lot of features and lines, my program is very, very inefficient - it basically never finishes. Is there anything sticking out that I should change to make it more efficient (i.e. the two for loops)?

It would probably be more efficient to write each line of data to your output file as it is generated, rather than building up a huge string in memory.
numpy is a popular Python module that's good for doing bulk operations on numbers. If you start with:
import numpy as np
list_line_sparse = np.zeros(num_features, dtype=np.uint8)
Then, given d as the list of numbers on the current line, you can simply do:
list_line_sparse[d] = 1
to set ALL of those indexes in the array at the same time, no loop required. (At the Python level at least, obviously there's still a loop involved, but it's down in the C implementation of numpy).

It is slowing down because you are doing string concatenation. It is better to work with lists.
Also you could use csv to read your space separated lines in, and to then write each row with commas automatically added:
import csv
num_features = 20
# Turn each line of space-separated indexes in input.txt into a row of
# num_features zeros/ones in output.txt.
with open('input.txt', 'r', newline='') as f_input, open('output.txt', 'w', newline='') as f_output:
    csv_input = csv.reader(f_input, delimiter=' ')
    csv_output = csv.writer(f_output)
    for row in csv_input:
        list_line_sparse = [0] * int(num_features)
        # With a single-space delimiter, repeated or trailing spaces show up
        # as empty fields; filter(None, ...) drops them before int().
        for v in map(int, filter(None, row)):
            list_line_sparse[v] = 1
        csv_output.writerow(list_line_sparse)
So if input.txt contained the following:
1 3 9 10
1 3 9 11
2 7 3 5
Giving you an output.txt containing:
0,1,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0
0,1,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0
0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0

Too many loops: first the item.split(), then the for x in zz, then for i in d, then for item in list_line_sparse, and then for i in d again. String concatenation could be your most expensive part: the .join and the out_file +=. And all of this happens for every line.
You could try a "character by character" parsing and writing. Something like this:
def _write_one(f, i, index):
    """Write zeros for columns i..index-1, then a 1; return the next column.

    A ", " separator is written first unless this is the first column.
    Zeros are only padded forwards, so indexes are expected in ascending
    order on each line.
    """
    if i > 0:
        f.write(", ")
    while i < index:
        f.write("0, ")
        i += 1
    f.write("1")
    return i + 1


#features per line
count = int(num_features)
# The with-block closes the output file even if a line fails to parse.
with open('outfile.txt', 'w') as f:
    #loop over all lines
    for item in lines:
        #next column to be written on this output line
        i = 0
        #the characters buffer
        index = ""
        #parse character by character
        for character in item:
            if character in (" ", "\r", "\n"):
                #a separator ends the number collected so far (if any)
                if index:
                    i = _write_one(f, i, int(index))
                    index = ""
            else:
                #add the character to the buffer
                index += character
        #if the line didn't end with a newline,
        #index could still be waiting to be parsed
        if index:
            i = _write_one(f, i, int(index))
        #fill with 0's
        while i < count:
            f.write("0" if i == 0 else ", 0")
            i += 1
        f.write("\n")

Let's rework your code into a simpler package that takes better advantage of Python's features:
import sys
NUM_FEATURES = 12
# Usage: script.py INPUT OUTPUT - expands each line of indexes in INPUT into
# a comma-separated row of NUM_FEATURES zeros/ones in OUTPUT.
with open(sys.argv[1]) as source, open(sys.argv[2], 'w') as sink:
    for line in source:
        list_line_sparse = [0] * NUM_FEATURES
        # split() already discards the trailing newline and extra spaces.
        for index in map(int, line.split()):
            list_line_sparse[index] = 1
        print(*list_line_sparse, file=sink, sep=',')
I revisited this problem with your "more efficiently" in mind. Although the above is more memory efficient, it is a hair slower time-wise. I reconsidered your original and came up with a solution that is less memory efficient but about 2x faster than your code:
import sys
NUM_FEATURES = 12
# Same conversion as a streaming version would do, but the whole output is
# built in memory and written with a single write() call.
rows = []
with open(sys.argv[1]) as source:
    for line in source:
        list_line_sparse = ["0"] * NUM_FEATURES
        for index in map(int, line.split()):
            list_line_sparse[index] = "1"
        rows.append(",".join(list_line_sparse) + '\n')
# One join at the end instead of growing a string with += on every line.
data = ''.join(rows)
with open(sys.argv[2], 'w') as sink:
    sink.write(data)
Like your original solution, it stores all the data in memory and writes it out at the end which is both a disadvantage (memory-wise) and an advantage (time-wise.)
input.txt
1 3 9 10
1 3 9 11
2 7 3 5
USAGE
% python3 test.py input.txt output.txt
output.txt
0,1,0,1,0,0,0,0,0,1,1,0
0,1,0,1,0,0,0,0,0,1,0,1
0,0,1,1,0,1,0,1,0,0,0,0

How can I efficiently store content into a different list based on prefix?

As you can see in the code below, a lot of lines are repetitive.
Everything is involved around the four keywords ['copy', 'scale', 'add', 'triad'], is it possible to just write one line of line.startswith and list.append and somehow dynamically substitute the four keywords at the right place?
The file I am reading has the following format (I made up the numbers):
copy: 1 2 3 4
scale: 1 2 3 4
add: 1 2 3 4
triad: 1 2 3 4
All I care about is the first column, which is all the 1s
# One list per keyword; each collects the first number after the keyword.
copy_s = []
scale_s = []
add_s = []
triad_s = []
with open('./a_file') as f:
    for line in f:
        line = line.strip().lower()
        # line.split()[1] is the first whitespace-separated field after the
        # "keyword:" label.
        if line.startswith('copy'):
            copy_s.append(float(line.split()[1]))
        elif line.startswith('scale'):
            scale_s.append(float(line.split()[1]))
        elif line.startswith('add'):
            add_s.append(float(line.split()[1]))
        elif line.startswith('triad'):
            triad_s.append(float(line.split()[1]))
P.S. Should I ask this type of question in StackOverflow or Code Review?

You could use a dictionary to store your prefixes and corresponding values:
# Map each prefix (the text before ":") to the list of first-column values
# seen for it.
d = {}
with open('./a_file') as f:
    for line in f:
        line = line.strip().lower()
        if not line:
            continue  # blank lines have no prefix or value
        prefix, num = line.split(':')[0], float(line.split()[1])
        # Append instead of assigning, so earlier values for the same
        # prefix are kept rather than overwritten.
        d.setdefault(prefix, []).append(num)
Use a collections.OrderedDict if the order matters.

Define a dynamic variable that chooses which array to actually append to, then change your elif block as follows:
# Pick the list that matches the line's prefix, then append to it once.
if line.startswith('copy'):
    array = copy_s
elif line.startswith('scale'):
    array = scale_s
elif line.startswith('add'):
    array = add_s
elif line.startswith('triad'):
    array = triad_s
else:
    array = None  # not one of the four prefixes
# Python has no "endif"; the block simply ends with the dedent.
if array is not None:
    array.append(float(line.split()[1]))

Reading large CSV files from nth line in Python (not from the beginning)

I have 3 huge CSV files containing climate data, each about 5GB.
The first cell in each line is the meteorological station's number (from 0 to about 100,000) each station contains from 1 to 800 lines in each file, which is not necessarily equal in all files. For example, Station 11 has 600, 500, and 200 lines in file1, file2, and file3 respectively.
I want to read all the lines of each station, do some operations on them, then write results to another file, then the next station, etc.
The files are too large to load at once in memory, so I tried some solutions to read them with minimal memory load, like this post and this post which include this method:
with open(...) as f:
for line in f:
<do something with line>
The problem with this method that it reads the file from the beginning every time, while I want to read files as follows:
for station in range (100798):
with open (file1) as f1, open (file2) as f2, open (file3) as f3:
for line in f1:
st = line.split(",")[0]
if st == station:
<store this line for some analysis>
else:
break # break the for loop and go to read the next file
for line in f2:
...
<similar code to f1>
...
for line in f3:
...
<similar code to f1>
...
<do the analysis to station, the go to next station>
The problem is that each time I start over to take next station, the for loop would start from the beginning, while I want it to start from where the 'Break' occurs at the nth line, i.e. to continue reading the file.
How can I do it?
Thanks in advance
Notes About the solutions below:
As I mentioned below at the time I posted my answer, I implemented the answer of #DerFaizio but I found it very slow in processing.
After I had tried the generator-based answer submitted by #PM_2Ring I found it very very fast. Maybe because it depends on Generators.
The difference between the two solutions shows in the number of stations processed per minute: about 2500 st/min for the generator-based solution versus 45 st/min for the Pandas-based solution, which makes the generator-based solution more than 55 times faster.
I will keep both implementations below for reference.
Many thanks to all contributors, especially #PM_2Ring.

The code below iterates over the files line by line, grabbing the lines for each station from each file in turn and appending them to a list for further processing.
The heart of this code is a generator file_buff that yields the lines of a file but which allows us to push a line back for later reading. When we read a line for the next station we can send it back to file_buff so that we can re-read it when it's time to process the lines for that station.
To test this code, I created some simple fake station data using create_data.
from random import seed, randrange
seed(123)
station_hi = 5
def create_data():
    ''' Fill 3 files with fake station data

    Writes datafile_1 .. datafile_3.  For every station number below the
    module-level station_hi each file gets 1 to 3 random lines of the form
    "<station> data<file><station><line>", which are also echoed to stdout.
    '''
    fbase = 'datafile_'
    for fnum in range(1, 4):
        with open(fbase + str(fnum), 'w') as f:
            for snum in range(station_hi):
                for i in range(randrange(1, 4)):
                    s = '{1} data{0}{1}{2}'.format(fnum, snum, i)
                    print(s)
                    f.write(s + '\n')
        print()
create_data()
# A file buffer that you can push lines back to
def file_buff(fh):
    """Yield the lines of *fh*, allowing one line to be pushed back.

    Calling ``gen.send(line)`` hands *line* back to the buffer; the send
    call itself returns that line, and the next ``next(gen)`` yields it
    again before reading further from *fh*.

    Args:
        fh: An iterator of lines, such as an open file.
    """
    prev = None
    while True:
        while prev:
            # First yield answers the send() call that pushed the line back.
            yield prev
            # Second yield replays the line to the consumer; if it is sent
            # back again, prev stays set and the cycle repeats.
            prev = yield prev
        try:
            line = next(fh)
        except StopIteration:
            # End the generator explicitly: since PEP 479 a StopIteration
            # escaping a generator body is turned into RuntimeError.
            return
        prev = yield line
# An infinite counter that yields numbers converted to strings
def str_count(start=0):
    """Yield str(start), str(start + 1), ... without end."""
    from itertools import count  # local import keeps the snippet self-contained

    for n in count(start):
        yield str(n)
# Extract station data from all 3 files
# Walk the three files in step, collecting every station's data in turn.
with open('datafile_1') as f1, open('datafile_2') as f2, open('datafile_3') as f3:
    fb1, fb2, fb3 = file_buff(f1), file_buff(f2), file_buff(f3)
    for snum_str in str_count():
        station_lines = []
        for fb in (fb1, fb2, fb3):
            for line in fb:
                #Extract station number string & station data
                sid, sdata = line.split()
                if sid != snum_str:
                    # This line contains data for the next station,
                    # so push it back to the buffer
                    fb.send(line)
                    # and go to the next file
                    break
                # Otherwise, append this data
                station_lines.append(sdata)

        #Process all the data lines for this station
        if not station_lines:
            #There's no more data to process
            break
        print('Station', snum_str)
        print(station_lines)
output
0 data100
1 data110
1 data111
2 data120
3 data130
3 data131
4 data140
4 data141
0 data200
1 data210
2 data220
2 data221
3 data230
3 data231
3 data232
4 data240
4 data241
4 data242
0 data300
0 data301
1 data310
1 data311
2 data320
3 data330
4 data340
Station 0
['data100', 'data200', 'data300', 'data301']
Station 1
['data110', 'data111', 'data210', 'data310', 'data311']
Station 2
['data120', 'data220', 'data221', 'data320']
Station 3
['data130', 'data131', 'data230', 'data231', 'data232', 'data330']
Station 4
['data140', 'data141', 'data240', 'data241', 'data242', 'data340']
This code can cope if station data is missing for a particular station from one or two of the files, but not if it's missing from all three files, since it breaks the main processing loop when the station_lines list is empty, but that shouldn't be a problem for your data.
For details on generators and the generator.send method, please see 6.2.9. Yield expressions in the docs.
This code was developed using Python 3, but it will also run on Python 2.6+ (you just need to include from __future__ import print_function at the top of the script).
If there may be station ids missing from all 3 files we can easily handle that. Just use a simple range loop instead of the infinite str_count generator.
from random import seed, randrange
seed(123)
station_hi = 7
def create_data():
    ''' Fill 3 files with fake station data

    Writes datafile_1 .. datafile_3.  For every station number below the
    module-level station_hi each file gets 0 or 1 random lines of the form
    "<station> data<file><station><line>", so some stations are missing
    from some files.  Written lines are also echoed to stdout.
    '''
    fbase = 'datafile_'
    for fnum in range(1, 4):
        with open(fbase + str(fnum), 'w') as f:
            for snum in range(station_hi):
                for i in range(randrange(0, 2)):
                    s = '{1} data{0}{1}{2}'.format(fnum, snum, i)
                    print(s)
                    f.write(s + '\n')
        print()
create_data()
# A file buffer that you can push lines back to
def file_buff(fh):
prev = None
while True:
while prev:
yield prev
prev = yield prev
prev = yield next(fh)
station_start = 0
station_stop = station_hi

# Walk the three files in step over a fixed range of station ids, so a
# station missing from all three files is skipped instead of ending the run.
with open('datafile_1') as f1, open('datafile_2') as f2, open('datafile_3') as f3:
    fb1, fb2, fb3 = file_buff(f1), file_buff(f2), file_buff(f3)
    for i in range(station_start, station_stop):
        snum_str = str(i)
        station_lines = []
        for fb in (fb1, fb2, fb3):
            for line in fb:
                #Extract station number string & station data
                sid, sdata = line.split()
                if sid != snum_str:
                    # This line contains data for the next station,
                    # so push it back to the buffer
                    fb.send(line)
                    # and go to the next file
                    break
                # Otherwise, append this data
                station_lines.append(sdata)

        if not station_lines:
            continue
        print('Station', snum_str)
        print(station_lines)
output
1 data110
3 data130
4 data140
0 data200
1 data210
2 data220
6 data260
0 data300
4 data340
6 data360
Station 0
['data200', 'data300']
Station 1
['data110', 'data210']
Station 2
['data220']
Station 3
['data130']
Station 4
['data140', 'data340']
Station 6
['data260', 'data360']

I would suggest to use pandas.read_csv. You can specify the rows to skip using skiprows and also use a reasonable number of rows to load depending on your filesize using nrows
Here is a link to the documentation:
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html

I posted the code below before #PM-2Ring posted his solution.
I would like to leave both solutions active:
The #1 solution that depends on Pandas library (by #DerFaizio). :
This solution finished 5450 stations in 120 minutes (about 45 stations/minute)
import pandas as pd
skips =[1, 1, 1] # to skip the header row forever
for station_number in range(100798):
storage = {}
tmax = pd.read_csv(full_paths[0], skiprows=skips[0], header=None, nrows=126000, usecols=[0, 1, 3])
tmin = pd.read_csv(full_paths[1], skiprows=skips[1], header=None, nrows=126000, usecols=[0, 1, 3])
tavg = pd.read_csv(full_paths[2], skiprows=skips[2], header=None, nrows=126000, usecols=[0, 1, 3])
# tmax is at position 0
for idx, station in enumerate(tmax[0]):
if station == station_number:
date_val = tmax[1][idx]
t_val = float(tmax[3][idx])/10.
storage[date_val] = [t_val, None, None]
skips[0] += 1
else:
break
# tmin is at position 1
for idx, station in enumerate(tmin[0]):
# station, date_val, _, val = lne.split(",")
if station == station_number:
date_val = tmin[1][idx]
t_val = float(tmin[3][idx]) / 10.
if date_val in storage:
storage[date_val][1] = t_val
else:
storage[date_val] = [None, t_val, None]
skips[1] += 1
else:
break
# tavg is at position 2
for idx, station in enumerate(tavg[0]):
...
# similar to Tmin
...
pass
station_info = []
for key in storage.keys():
# do some analysis
# Fill the list station_info
pass
data_out.writerows(station_info)
The following solution is the Generator based solution (by #PM-2Ring)
This solution finished 30000 stations in 12 minutes (about 2500 stations/minute)
files = ['Tmax', 'Tmin', 'Tavg']
headers = ['Nesr_Id', 'r_Year', 'r_Month', 'r_Day', 'Tmax', 'Tmin', 'Tavg']
# A file buffer that you can push lines back to
def file_buff(fh):
    """Yield the lines of *fh*, allowing one line to be pushed back.

    Calling ``gen.send(line)`` hands *line* back to the buffer; the send
    call itself returns that line, and the next ``next(gen)`` yields it
    again before reading further from *fh*.

    Args:
        fh: An iterator of lines, such as an open file.
    """
    prev = None
    while True:
        while prev:
            # First yield answers the send() call that pushed the line back.
            yield prev
            # Second yield replays the line to the consumer; if it is sent
            # back again, prev stays set and the cycle repeats.
            prev = yield prev
        try:
            line = next(fh)
        except StopIteration:
            # Works on Python 2 and 3.  On 3.7+ a StopIteration escaping a
            # generator body would otherwise become RuntimeError (PEP 479).
            return
        prev = yield line
# An infinite counter that yields numbers converted to strings
def str_count(start=0):
    """Yield str(start), str(start + 1), ... without end."""
    n = start
    while True:
        yield str(n)
        n += 1
# NULL = -999.99
# Merge the Tmax / Tmin / Tavg files station by station and write one output
# row per (station, date).  Single-argument print(...) is used so the lines
# parse on both Python 2 and Python 3.
# NOTE(review): the 'wb+' mode below is the Python 2 convention for csv
# output; under Python 3 it would need to be 'w' with newline=''.
print("Time started: {}".format(time.strftime('%Y-%m-%d %H:%M:%S')))
with open('Results\\GHCN_Daily\\Important\\Temp_All_out_gen.csv', 'wb+') as out_file:
    data_out = csv.writer(out_file, quoting=csv.QUOTE_NONE, quotechar='', delimiter=',', escapechar='\\',
                          lineterminator='\n')
    data_out.writerow(headers)
    full_paths = [os.path.join(source, '{}.csv'.format(file_name)) for file_name in files]
    # Extract station data from all 3 files.  The third handle must read
    # full_paths[2] (Tavg); opening full_paths[0] again would read Tmax twice.
    with open(full_paths[0]) as f1, open(full_paths[1]) as f2, open(full_paths[2]) as f3:
        file_buffers = (file_buff(f1), file_buff(f2), file_buff(f3))
        # A bounded range is used because an endless counter would keep
        # looping after the files are exhausted.
        # NOTE(review): 100798 assumes ids run 0..100797, as the progress
        # message below suggests - confirm against the data.
        for station in range(100798):
            snum_str = str(station)
            storage = {}  # date -> [tmax, tmin, tavg]
            count = [0, 0, 0]
            for file_id, fb in enumerate(file_buffers):
                for line in fb:
                    # Lines whose first field is classified as a string
                    # (presumably the header row) are skipped.
                    if isinstance(get__proper_data_type(line.split(",")[0]), str):
                        continue
                    # Extract station number string & station data
                    sid, date_val, _dummy, sdata = line.split(",")
                    if sid != snum_str:
                        # This line contains data for the next station,
                        # so push it back to the buffer
                        fb.send(line)
                        # and go to the next file
                        break
                    sdata = float(sdata) / 10.
                    count[file_id] += 1
                    # Store the value in this file's own slot.  Always
                    # seeding slot 0 would file a Tmin or Tavg reading
                    # under Tmax whenever the date is new.
                    if date_val not in storage:
                        storage[date_val] = [None, None, None]
                    storage[date_val][file_id] = sdata

            print("St# {:6d}/100797. Time: {}. Tx({}), Tn({}), Ta({}) ".format(
                int(snum_str), time.strftime('%H:%M:%S'), count[0], count[1], count[2]))
            station_info = []
            for key, (tx, tn, ta) in storage.items():
                # Fill in one missing value from the other two, using
                # ta = (tx + tn) / 2.
                if ta is None:
                    if tx is not None and tn is not None:
                        ta = round((tx + tn) / 2., 1)
                if tx is None:
                    if tn is not None and ta is not None:
                        tx = round(2. * ta - tn, 1)
                if tn is None:
                    if tx is not None and ta is not None:
                        tn = round(2. * ta - tx, 1)
                py_date = from_excel_ordinal(int(key))
                station_info.append([snum_str, py_date.year, py_date.month, py_date.day, tx, tn, ta])
            data_out.writerows(station_info)
Thanks for all.

Going with the built-in csv module, you could do something like:
# Skip the first n rows of csvfile, then process the rest.
with open(csvfile, 'r') as f:
    reader = csv.reader(f, delimiter=',')
    for _ in range(n):
        # The built-in next() works on Python 2 and 3 (reader.next() is
        # Python 2 only); the default stops quietly if the file is shorter
        # than n lines.
        if next(reader, None) is None:
            break
    for row in reader:
        print(row)  # Or whatever you want to do here
Where n is the number of lines you want to skip.

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.