How to import from CSV - Python

I am trying to parse data from several *.csv files and save them as lists for later manipulation, but I keep failing.
I have read numerous tutorials and related topics on SO and other sites, but couldn't find a solution to my problem. After several days of working on the code, I am stuck and don't know how to proceed.
# saves filepaths of *.csv files in lists (constant)
CSV_OLDFILE = glob.glob("./oldcsv/*.csv")
assert isinstance(CSV_OLDFILE, list)
CSV_NEWFILE = glob.glob("./newcsv/*.csv")
assert isinstance(CSV_NEWFILE, list)

def get_data(input):
    """copies numbers from *.csv files, saves them in list RAW_NUMBERS"""
    for i in range(0, 5):  # for each of the six files
        with open(input[i], 'r') as input[i]:  # open as "read"
            for line in input[i]:  # parse lines for data
                input.append(int(line))  # add to list
    return input

def write_data(input):
    """writes list PROCESSED_NUMBERS_FINAL into new *.csv files"""
    for i in range(0, 5):  # for each of the six files
        with open(input[i], 'w') as data:  # open as "write"
            data = csv.writer(input[i])
    return data

RAW_NUMBERS = get_data(CSV_OLDFILE)
# other steps for processing data
write_data(PROCESSED_NUMBERS_FINAL)
Actual result:
TypeError: object of type '_io.TextIOWrapper' has no len()
Expected result: save data from *.csv files, manipulate and write them to new *.csv files.
I think the problem is probably that I'm calling len on a file object, but I don't know what the correct implementation should look like.
Complete backtrace:
Traceback (most recent call last):
  File "./solution.py", line 100, in <module>
    PROCESSED_NUMBERS = slowsort_start(RAW_NUMBERS)
  File "./solution.py", line 73, in slowsort_start
    (input[i], 0, len(input[i])-1))
TypeError: object of type '_io.TextIOWrapper' has no len()

Question: read data from *.csv files, manipulate the numbers, and write them to new *.csv files.
Here is an OOP solution that holds the numbers in a dict of dict:list.
Initialize the class object with the in_path and out_path:
import os, csv

class ReadProcessWrite:

    def __init__(self, in_path, out_path):
        self.in_path = in_path
        self.out_path = out_path
        self.number = {}
Read all files from self.in_path, filter .csv files.
Create a dict with key ['raw'] and assign all numbers from this *.csv to a list.
Note: assuming one number per line!
    def read_numbers(self):
        for fname in os.listdir(self.in_path):
            if fname.endswith('.csv'):
                self.number[fname] = {}
                with open(os.path.join(self.in_path, fname)) as in_csv:
                    self.number[fname]['raw'] = [int(number[0]) for number in csv.reader(in_csv)]
                    print('read_numbers {} {}'.format(fname, self.number[fname]['raw']))
        return self
Process the ['raw'] numbers and assign the result to the key ['final'].
    def process_numbers(self):
        def process(numbers):
            return [n*10 for n in numbers]

        for fname in self.number:
            print('process_numbers {} {}'.format(fname, self.number[fname]['raw']))
            # other steps for processing data
            self.number[fname]['final'] = process(self.number[fname]['raw'])
        return self
Write the results from key ['final'] to self.out_path, using the same .csv filenames.
    def write_numbers(self):
        for fname in self.number:
            print('write_numbers {} {}'.format(fname, self.number[fname]['final']))
            with open(os.path.join(self.out_path, fname), 'w') as out_csv:
                csv.writer(out_csv).writerows([[row] for row in self.number[fname]['final']])
Usage:
if __name__ == "__main__":
    ReadProcessWrite('oldcsv', 'newcsv').read_numbers().process_numbers().write_numbers()
Output:
read_numbers 001.csv [1, 2, 3]
read_numbers 003.csv [7, 8, 9]
read_numbers 002.csv [4, 5, 6]
process_numbers 003.csv [7, 8, 9]
process_numbers 002.csv [4, 5, 6]
process_numbers 001.csv [1, 2, 3]
write_numbers 003.csv [70, 80, 90]
write_numbers 002.csv [40, 50, 60]
write_numbers 001.csv [10, 20, 30]
Tested with Python: 3.4.2

So this is the solution I found, after lots of trial-and-error and research:
# initializing lists for later use
RAW_DATA = []  # unsorted numbers
SORTED_DATA = []  # sorted numbers
PROCESSED_DATA = []  # sorted and multiplied numbers

def read_data(filepath):  # from oldfiles
    """returns parsed unprocessed numbers from old *.csv files"""
    numbers = open(filepath, "r").read().splitlines()  # reads, gets input from rows
    return numbers

def get_data(filepath):  # from oldfiles
    """fills list raw_data with parsed input from old *.csv files"""
    for i in range(0, 6):  # for each of the six files
        RAW_DATA.append(read_data(filepath[i]))  # add their data to list

def write_data(filepath):  # parameter: newfile
    """create new *.csv files with input from sorted_data and permission 600"""
    for i in range(0, 6):  # for each of the six files
        with open(filepath[i], "w", newline="\n") as file:  # open with "write"
            writer = csv.writer(file)  # calls method for writing
            for item in SORTED_DATA[i]:  # prevents data from being treated as one object
                writer.writerow([item])  # puts each entry in row
            os.chmod(filepath[i], 0o600)  # sets permission to 600 (octal)
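Putting the pieces together, a minimal usage sketch (my own illustration, assuming the csv, glob and os imports and the CSV_OLDFILE / CSV_NEWFILE lists from the question, with a plain sort standing in for the real processing step):

import csv, glob, os

CSV_OLDFILE = glob.glob("./oldcsv/*.csv")   # six existing input files
CSV_NEWFILE = glob.glob("./newcsv/*.csv")   # six output file paths

get_data(CSV_OLDFILE)                       # fills RAW_DATA with six lists of strings
for sublist in RAW_DATA:                    # stand-in for the real processing
    SORTED_DATA.append(sorted(int(n) for n in sublist))
write_data(CSV_NEWFILE)                     # writes SORTED_DATA and chmods the new files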
This lets me read from files, as well as create and write to files. Given that I need a specific setup, with data only ever being found in "column A", I chose this solution. But thanks again to everybody who answered and commented!


Trying to save list data to reuse while testing python scripts

I have a Python script that pulls a huge chunk of JSON data and then iterates over it to build two lists:
# Get all price data
response = c.get_price_history_every_minute(symbol)

# Build prices list
prices = list()
for i in range(len(response.json()["candles"])):
    prices.append(response.json()["candles"][i]["prices"])

# Build times list
times = list()
for i in range(len(response.json()["candles"])):
    times.append(response.json()["candles"][i]["datetime"])
This works fine, but it takes a LONG time to pull in all of the data and build the lists. I am doing some testing trying to build out a complex script, and would like to save these two lists to two files, and then import the data from those files and recreate the lists when I run subsequent tests to skip generating, iterating and parsing the JSON.
I have been trying the following:
# Write Price to a File
a_file = open("prices7.txt", "w")
content = str(prices)
a_file.write(content)
a_file.close()
And then in future scripts:
# Load Prices from File
prices_test = array('d')
a_file = open("prices7.txt", "r")
prices_test = a_file.read()
The output from my JSON lists and the data loaded from the file look identical, but when I try to do anything with the data loaded from the file it is garbage...
print (prices)
{The output looks like this} [69.73, 69.72, 69.64, ... 69.85, 69.82, etc]
print (prices_test)
The output looks identical
If I run a simple query like:
print (prices[1], prices[2])
I get the expected output (69.73, 69.72)
If I do the same on the list created from the file:
print (prices_test[1], prices_test[2])
I get the output ( [,6 )
It is pulling every character in the string individually instead of using the comma separated values as I would have expected...
I've googled every combination of search terms I could think of so any help would be GREATLY appreciated!!
I had to do something like this before. I used pickle to do it.
import pickle

def pickle_the_data(pickle_name, list_to_pickle):
    """This function pickles a given list.

    Args:
        pickle_name (str): name of the resulting pickle.
        list_to_pickle (list): list that you need to pickle
    """
    with open(pickle_name + '.pickle', 'wb') as pikd:
        pickle.dump(list_to_pickle, pikd)
    file_name = pickle_name + '.pickle'
    print(f'{file_name}: Created.')

def unpickle_the_data(pickle_file_name):
    """This will unpickle a pickled file

    Args:
        pickle_file_name (str): file name of the pickle

    Returns:
        list: when we pass a pickled list, it will return an
        unpickled list.
    """
    with open(pickle_file_name, 'rb') as pk_file:
        unpickleddata = pickle.load(pk_file)
    return unpickleddata
So first pickle your list: pickle_the_data(name_for_pickle, your_list).
Then, when you need to load the list: unpickle_the_data(name_of_your_pickle_file).
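For example, a quick round trip with the prices list from the question might look like this (the filename is just a placeholder):

pickle_the_data('prices7', prices)                  # writes prices7.pickle
prices_test = unpickle_the_data('prices7.pickle')   # returns the original list object
print(prices_test[1], prices_test[2])               # real floats again, not single characters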
This is what I was trying to explain in the comments section. Note that I replaced response.json() with jsonData, taking it out of each for-loop, and reduced both loops into a single one for efficiency. Now the code should run faster.
import json

def saveData(filename, data):
    # Convert Data to a JSON String
    data = json.dumps(data)

    # Open the file, then save it
    try:
        file = open(filename, "wt")
    except:
        print("Failed to save the file.")
        return False
    else:
        file.write(data)
        file.close()
        return True

def loadData(filename):
    # Open the file, then load its contents
    try:
        file = open(filename, "rt")
    except:
        print("Failed to load the file.")
        return None
    else:
        data = file.read()
        file.close()

        # Data is a JSON string, so now we convert it back
        # to a Python Structure:
        data = json.loads(data)
        return data

# Get all price data
response = c.get_price_history_every_minute(symbol)
jsonData = response.json()

# Build prices and times list:
#
# As you're iterating over the same "candles" index on both loops
# when building those two lists, just reduce it to a single loop
prices = list()
times = list()

for i in range(len(jsonData["candles"])):
    prices.append(jsonData["candles"][i]["prices"])
    times.append(jsonData["candles"][i]["datetime"])

# Now, when you need, just save each list like this:
saveData("prices_list.json", prices)
saveData("times_list.json", times)

# And retrieve them back when you need it later:
prices = loadData("prices_list.json")
times = loadData("times_list.json")
Btw, pickle does the same thing, but it uses binary data instead of JSON, which is probably faster for saving / loading data. I don't know, I haven't tested it.
With JSON you have the advantage of readability, as you can open each file and read it directly, if you understand JSON syntax.

How can I split csv files in python?

Because of a memory error, I have to split my csv files. I researched it and found this code from the Stack Overflow user Aziz Alto:
csvfile = open('#', 'r').readlines()
filename = 1
for i in range(len(csvfile)):
    if i % 10000000 == 0:
        open(str(filename) + '.csv', 'w+').writelines(csvfile[i:i+10000000])
        filename += 1
It works well, but for the second file the code did not add the header, which is very important for me. My question is: how can I add the header to the second file?
import pandas as pd

rows = pd.read_csv("csvfile.csv", chunksize=5000000)

for i, chunk in enumerate(rows):
    chunk.to_csv('out{}.csv'.format(i))  # i is the chunk number of each iteration
With chunksize you specify how many rows you want; in Excel you can have up to 1,048,576 rows.
This will save each chunk as 5000000 rows, with the header.
Hope this helps!!
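One small caveat the answer does not mention: to_csv also writes the DataFrame's row index as an extra first column by default, so you may want to pass index=False, for example:

import pandas as pd

for i, chunk in enumerate(pd.read_csv("csvfile.csv", chunksize=5000000)):
    # each chunk keeps the header; index=False drops pandas' own row numbers
    chunk.to_csv('out{}.csv'.format(i), index=False)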
From the 2nd file onwards you always have to add the 1st line of your original file (the one containing the header):
# this loads the first file fully into memory
with open('#', 'r') as f:
    csvfile = f.readlines()

linesPerFile = 1000000
filename = 1
# this is better than your former loop, it loops in 1000000-line pieces,
# instead of incrementing 1000000 times and only writing on the millionth one
for i in range(0, len(csvfile), linesPerFile):
    with open(str(filename) + '.csv', 'w+') as f:
        if filename > 1:  # this is the second or later file, we need to write the
            f.write(csvfile[0])  # header again if 2nd.... file
        f.writelines(csvfile[i:i+linesPerFile])
    filename += 1
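Since the question was motivated by a memory error, here is a streaming variation of the same idea (my own sketch, not part of the original answer): it reads and writes line by line, so it never holds the whole file in memory. It assumes the first line is the header.

def split_csv(path, lines_per_file=1000000):
    with open(path, 'r') as src:
        header = src.readline()          # keep the header for every part
        out = None
        part = 0
        for lineno, line in enumerate(src):
            if lineno % lines_per_file == 0:
                if out:
                    out.close()
                part += 1
                out = open('{}.csv'.format(part), 'w')
                out.write(header)        # every part starts with the header
            out.write(line)
        if out:
            out.close()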
Fast csv file splitting
If you have a very big file and you have to try different partitions (say, to find the best way to split it), the above solutions are too slow.
Another way to solve this (and a very fast one) is to create an index file by record number. It takes about six minutes to create an index file of a csv file of 6867839 rows and 9 Gb, and an additional 2 minutes for joblib to store it on disk.
This method is particularly impressive if you are dealing with huge files, like 3 Gb or more.
Here's the code for creating the index file:
# Usage:
#   creaidx.py filename.csv
# indexes a csv file by record number. This can be used to
# access any record directly or to split a file without the
# need of reading it all. The index file is joblib-stored as
# filename.index
# filename.csv is the file to create index for
import os,sys,joblib

BLKSIZE=512

def checkopen(s,m='r',bz=None):
    if os.access(s,os.F_OK):
        if bz==None:
            return open(s,m)      # returns open file
        else:
            return open(s,m,bz)   # returns open file with buffer size
    else:
        return None

def get_blk():
    global ix,off,blk,buff
    while True:                   # dealing with special cases
        if ix==0:
            n=0
            break
        if buff[0]==b'\r':
            n=2
            off=0
            break
        if off==BLKSIZE-2:
            n=0
            off=0
            break
        if off==BLKSIZE-1:
            n=0
            off=1
            break
        n=2
        off=buff.find(b'\r')
        break
    while (off>=0 and off<BLKSIZE-2):
        idx.append([ix,blk,off+n])
        # g.write('{},{},{}\n'.format(ix,blk,off+n))
        print(ix,end='\r')
        n=2
        ix+=1
        off=buff.find(b'\r',off+2)

def crea_idx():
    global buff,blk
    buff=f.read(BLKSIZE)
    while len(buff)==BLKSIZE:
        get_blk()
        buff=f.read(BLKSIZE)
        blk+=1
    get_blk()
    idx[-1][2]=-1
    return

if len(sys.argv)==1:
    sys.exit("Need to provide a csv filename!")

ix=0
blk=0
off=0
idx=[]
buff=b'0'
s=sys.argv[1]
f=checkopen(s,'rb')
idxfile=s.replace('.csv','.index')
if checkopen(idxfile)==None:
    with open(idxfile,'w') as g:
        crea_idx()
        joblib.dump(idx,idxfile)
else:
    if os.path.getctime(idxfile)<os.path.getctime(s):
        with open(idxfile,'w') as g:
            crea_idx()
            joblib.dump(idx,idxfile)
f.close()
Let's use a toy example:
strings,numbers,colors
string1,1,blue
string2,2,red
string3,3,green
string4,4,yellow
The index file will be:
[[0, 0, 0],
[1, 0, 24],
[2, 0, 40],
[3, 0, 55],
[4, 0, 72],
[5, 0, -1]]
Note the -1 at the last index element to indicate end of index file in case of a sequential access. You can use a tool like this to access any individual row of the csv file:
def get_rec(n=1,binary=False):
    n=1 if n<0 else n+1
    s=b'' if binary else ''
    if len(idx)==0: return ''
    if idx[n-1][2]==-1: return ''
    f.seek(idx[n-1][1]*BLKSIZE+idx[n-1][2])
    buff=f.read(BLKSIZE)
    x=buff.find(b'\r')
    while x==-1:
        s=s+buff if binary else s+buff.decode()
        buff=f.read(BLKSIZE)
        x=buff.find(b'\r')
    return s+buff[:x]+b'\r\n' if binary else s+buff[:x].decode()
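For instance, on the toy file above (assuming '\r\n' line endings as the indexing code expects, with f still open in 'rb' mode and idx loaded), the calls would look roughly like this:

header = get_rec(0, True)   # b'strings,numbers,colors\r\n' (header, as bytes)
row2 = get_rec(2)           # 'string2,2,red' (second data record, as text)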
The first field of the index record is obviously unnecessary; it is kept there for debugging purposes. As a side note, if you substitute this field with any field of the csv record and sort the index file by that field, you get the csv file sorted by that field whenever you use the index to access the csv file.
Now, once your index file is created, you just call the following program with two command line parameters: the filename (the one whose index was already created) and a number between 1 and 100, which is the percentage at which the file will be split:
import time
start_time = time.time()

BLKSIZE=512
WSIZE=1048576   # pow(2,20) 1Mb for faster reading/writing

import sys
import joblib
from common import Drv,checkopen

ix=0
blk=0
off=0
idx=[]
buff=b'0'

if len(sys.argv)<3:
    sys.exit('Argument missing!')
s=Drv+sys.argv[1]
if sys.argv[2].isnumeric():
    pct=int(sys.argv[2])/100
else:
    sys.exit('Bad percentage: '+sys.argv[2])
f=checkopen(s,'rb')
idxfile=s.replace('.csv','.index')
if checkopen(idxfile):
    print('Loading index...')
    idx=joblib.load(idxfile)
    print('Done loading index.')
else:
    sys.exit(idxfile+' does not exist.')

head=get_rec(0,True)
n=int(pct*(len(idx)-2))
off=idx[n+1][1]*BLKSIZE+idx[n+1][2]-len(head)-1
num=off//WSIZE
res=off%WSIZE

sout=s.replace('.csv','.part1.csv')
i=0
with open(sout,'wb') as g:
    g.write(head)
    f.seek(idx[1][1]*BLKSIZE+idx[1][2])
    for x in range(num):
        print(i,end='\r')
        i+=1
        buff=f.read(WSIZE)
        g.write(buff)
    buff=f.read(res)
    g.write(buff)
print()

i=0
sout=s.replace('.csv','.part2.csv')
with open(sout,'wb') as g:
    g.write(head)
    f.seek(idx[n+1][1]*BLKSIZE+idx[n+1][2])
    buff=f.read(WSIZE)
    while len(buff)==WSIZE:
        g.write(buff)
        print(i,end='\r')
        i+=1
        buff=f.read(WSIZE)
    g.write(buff)

end_time = time.time()
The files are created using blocks of 1048576 bytes. You can play with that figure to make file creation faster or to tailor it to machines with fewer memory resources.
The file is split into only two partitions, each of them having the header of the original file. It is not too difficult to change the code to split files into more than two partitions.
Finally to put things in perspective, to split a csv file of 6867839 rows and 9 Gb by 50%, it took me roughly 6 minutes to create the index file and another 2 minutes for joblib to store it on disk. It took 3 additional minutes to split the file.

Writing multiple lists to multiple output files

I am working with datasets stored in large text files. For the analysis I am carrying out, I open the files, extract parts of the dataset and compare the extracted subsets. My code works like so:
from math import ceil

with open("seqs.txt","rb") as f:
    f = f.readlines()

assert type(f) == list, "ERROR: file object not converted to list"

fives = int( ceil(0.05*len(f)) )
thirds = int( ceil(len(f)/3) )

## top/bottom 5% of dataset
low_5=f[0:fives]
top_5=f[-fives:]
## top/bottom 1/3 of dataset
low_33=f[0:thirds]
top_33=f[-thirds:]

## Write lists to file
# top-5
with open("high-5.out","w") as outfile1:
    for i in top_5:
        outfile1.write("%s" %i)
# low-5
with open("low-5.out","w") as outfile2:
    for i in low_5:
        outfile2.write("%s" %i)
# top-33
with open("high-33.out","w") as outfile3:
    for i in top_33:
        outfile3.write("%s" %i)
# low-33
with open("low-33.out","w") as outfile4:
    for i in low_33:
        outfile4.write("%s" %i)
I am trying to find a more clever way of automating the process of writing the lists out to files. In this case there are only four, but in future cases where I may end up with as many as 15-25 lists I would like some function to take care of this. I wrote the following:
def write_to_file(*args):
    for i in args:
        with open(".out", "w") as outfile:
            outfile.write("%s" %i)
but the resulting file only contains the final list when I call the function like so:
write_to_file(low_33,low_5,top_33,top_5)
I understand that I have to define an output file for each list (which I am not doing in the function above), I'm just not sure how to implement this. Any ideas?
Make your variable names match your filenames and then use a dictionary to hold them instead of keeping them in the global namespace:
data = {'high_5': top_5,
        'low_5': low_5,
        'high_33': top_33,
        'low_33': low_33}

for key in data:
    with open('{}.out'.format(key), 'w') as output:
        for i in data[key]:
            output.write(i)
This keeps your data in a single, easy-to-use place, and assuming you want to apply the same actions to all of it, you can continue using the same paradigm.
As mentioned by PM2Ring below, it is advisable to use underscores (as you do in the variable names) instead of dashes (as you do in the filenames), because that way you can pass the dictionary keys as keyword arguments into a writing function:
write_to_file(**data)
This would equate to:
write_to_file(low_5=f[:fives], high_5=f[-fives:],...) # and the rest of the data
From this you could use one of the functions defined by the other answers.
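For instance, a minimal keyword-argument writer along those lines might look like this sketch (my own illustration, not taken from the other answers):

def write_to_file(**data):
    # each keyword becomes the output filename, its value the lines to write
    for name, lines in data.items():
        with open('{}.out'.format(name), 'w') as output:
            for line in lines:
                output.write(line)

write_to_file(**data)   # using the dictionary built above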
You could have one output file per argument by incrementing a counter for each argument. For example:
def write_to_file(*args):
    for index, i in enumerate(args):
        with open("{}.out".format(index+1), "w") as outfile:
            outfile.write("%s" %i)
The example above will create output files "1.out", "2.out", "3.out", and "4.out".
Alternatively, if you had specific names you wanted to use (as in your original code), you could do something like the following:
def write_to_file(args):
    for name, data in args:
        with open("{}.out".format(name), "w") as outfile:
            outfile.write("%s" % data)

args = [('low-33', low_33), ('low-5', low_5), ('high-33', top_33), ('high-5', top_5)]
write_to_file(args)
which would create output files "low-33.out", "low-5.out", "high-33.out", and "high-5.out".
Don't try to be clever. Instead aim to have your code readable, easy to understand. You can group repeated code into a function, for example:
from math import ceil

def save_to_file(data, filename):
    with open(filename, 'wb') as f:
        for item in data:
            f.write('{}'.format(item))

with open('data.txt') as f:
    numbers = list(f)

five_percent = int(len(numbers) * 0.05)
thirty_three_percent = int(ceil(len(numbers) / 3.0))
# Why not: thirty_three_percent = int(len(numbers) * 0.33)

save_to_file(numbers[:five_percent], 'low-5.out')
save_to_file(numbers[-five_percent:], 'high-5.out')
save_to_file(numbers[:thirty_three_percent], 'low-33.out')
save_to_file(numbers[-thirty_three_percent:], 'high-33.out')
Update
If you have quite a number of lists to write, then it makes sense to use a loop. I suggest having two functions, save_top_n_percent and save_low_n_percent, to help with the job. They contain a little duplicated code, but by separating them into two functions, the code is clearer and easier to understand.
def save_to_file(data, filename):
    with open(filename, 'wb') as f:
        for item in data:
            f.write(item)

def save_top_n_percent(n, data):
    n_percent = int(len(data) * n / 100.0)
    save_to_file(data[-n_percent:], 'top-{}.out'.format(n))

def save_low_n_percent(n, data):
    n_percent = int(len(data) * n / 100.0)
    save_to_file(data[:n_percent], 'low-{}.out'.format(n))

with open('data.txt') as f:
    numbers = list(f)

for n_percent in [5, 33]:
    save_top_n_percent(n_percent, numbers)
    save_low_n_percent(n_percent, numbers)
On this line you are opening up a file called .out each time and writing to it.
with open(".out", "w") as outfile:
You need to make the ".out" filename unique for each i in args. You can achieve this by passing lists as the args, where each list contains the file name and the data.
def write_to_file(*args):
    for i in args:
        with open("%s.out" % i[0], "w") as outfile:
            outfile.write("%s" % i[1])
And pass in arguments like so...
write_to_file(["low_33",low_33],["low_5",low_5],["top_33",top_33],["top_5",top_5])
You are creating a file called '.out' and overwriting it each time.
def write_to_file(*args):
    for i in args:
        filename = i + ".out"
        contents = globals()[i]
        with open(filename, "w") as outfile:
            outfile.write("%s" % contents)

write_to_file("low_33", "low_5", "top_33", "top_5")
https://stackoverflow.com/a/6504497/3583980 (variable name from a string)
This will create low_33.out, low_5.out, top_33.out, top_5.out and their contents will be the lists stored in these variables.

Python: how to save a list with objects in a file?

I'm trying to create different objects (using classes and objects) and save them in a file to edit or retrieve them later. However, this is how it looks:
GlobalCategories=[]
GlobalContent=[]

def LoadData(x,y):
    import pickle
    with open('bin.dat') as f:
        x,y = pickle.load(f)

def SaveData(x,y):
    import pickle
    with open('bin.dat', 'wb') as f:
        pickle.dump([x,y], f)

def Loader(x,y):
    try:
        LoadData(x,y)
    except:
        SaveData(x,y)
and this is the snippet that shows how I save the info to the lists (Tema is the class and the other stuff are methods of that class):
newtheme=Tema()
newtheme.setInfo_name(newstr)
newtheme.setInfo_code(newcode)
GlobalCategories.append(newtheme)
SaveData(GlobalContent,GlobalCategories)
x and y are global lists where I store the objects. (I have noticed that it saves the memory address of each object.)
When I first run it, it creates the file and saves the information in it. However, if I close it, try to run it again and load the info, the program erases the information and creates the file again, so anything that was stored is gone.
I don't know if this is a proper way to store objects or if there's a better way, so any advice is very welcome.
@abarnert: Thank you abarnert! What I want to do is save a list with two lists inside. For example, one list is going to save the make (Toyota, Nissan, etc.) and the other list the car model (Tundra, Murano). Now, each element is an object which I add to a list when created.
newtheme=Theme()
newtheme.setInfo_name(newstr)
GlobalCategories.append(newtheme)
This is how I save the object in the global list. GlobalCategories is one of the two lists I want to load later, after I have closed the program (it would be like the list of car companies from the example). Where I have the problem is loading the objects from the lists after I have closed and restarted the program, because I am able to retrieve and edit them from the list when I have not closed the shell.
I need to load and store the make and car objects in their respective lists once I start the program, so I can manipulate them later.
Thank you abarnert once again!
It's hard to know what the problem is without context of how you are trying to use your LoadData and SaveData functions. However, here is a little demo that does what I think you want.
import pickle
import random

def load_data():
    try:
        with open("bin.dat") as f:
            x, y = pickle.load(f)
    except:
        x, y = [], []
    return x, y

def save_data(data):
    with open("bin.dat", "wb") as f:
        pickle.dump(data, f)

if __name__ == "__main__":
    x, y = load_data()
    print x, y
    x.append(random.randint(1, 10))
    y.append(random.randint(1, 10))
    save_data([x, y])
OUTPUT FROM CONSECUTIVE RUNS
[] []
[9] [9]
[9, 10] [9, 9]
[9, 10, 2] [9, 9, 4]
[9, 10, 2, 5] [9, 9, 4, 1]
[9, 10, 2, 5, 6] [9, 9, 4, 1, 9]
[9, 10, 2, 5, 6, 10] [9, 9, 4, 1, 9, 1]
It's hard to be sure, but I'm guessing your problem is that you're writing a binary file, then trying to read it back as text, and you're using Python 2.x on Windows.
In this code:
def LoadData(x,y):
    import pickle
    with open('bin.dat') as f:
        x,y = pickle.load(f)
If you happened to have any LF newline characters in the binary pickle stream, opening the file as text will convert them to CR/LF pairs. This will cause the pickle to be invalid, and therefore it'll raise an exception.
In this code:
def Loader(x,y):
    try:
        LoadData(x,y)
    except:
        SaveData(x,y)
… you just swallow any exception and save some empty values.
You probably only want to handle file-not-found errors here (IOError, OSError, or FileNotFoundError, depending on your Python version).
But you definitely want to put the exception into a variable to help debug your problem, like this:
def Loader(x,y):
    try:
        LoadData(x,y)
    except Exception as e:
        SaveData(x,y)
You can put a breakpoint on the SaveData line in the debugger, or just add a print(e) line and watch the output, to see why you're getting there.
Meanwhile, even after you fix that, LoadData will never do anything useful. Assigning x,y = pickle.load(f) just rebinds the local variables x and y. The fact that they have the same names as the local variables in Loader doesn't mean that Loader's variables get changed. Neither does the fact that they used to refer to the same values.
Python doesn't have "reference variables" or "output parameters". The normal way to do this is to just return values you want to pass back to the caller:
def LoadData():
    import pickle
    with open('bin.dat') as f:
        x,y = pickle.load(f)
    return x,y
And of course Loader has to call it properly:
def Loader(x,y):
    try:
        x,y = LoadData()
    except:
        SaveData(x,y)
And you have the exact same problem again in Loader, so you need to fix it again there, and in its caller.
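For illustration only, a corrected Loader and call site could look something like this (a sketch using the names from the question; it also narrows the exception type as suggested above):

def Loader(x, y):
    try:
        return LoadData()            # returns the two unpickled lists
    except (IOError, OSError):       # e.g. bin.dat does not exist yet
        SaveData(x, y)               # create a fresh file with the current lists
        return x, y

GlobalContent, GlobalCategories = Loader(GlobalContent, GlobalCategories)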

Reading in multiple hdf5 files and appending them to a new dictionary

I have a list of hdf5 files which I would like to open, reading the appropriate values into a new dictionary and eventually writing them to a text file. I don't necessarily know the values, so the user defines them in an array as an input to the code. The number of files needed is defined by the number of days' worth of data the user wants to look at.
new_data_dic = {}
for j in range(len(values)):
    new_data_dic[values[j]] = rbsp_ephm[values[j]]

for i in (np.arange(len(filenames_a)-1)+1):
    rbsp_ephm = h5py.File(filenames_a[i])
    for j in range(len(values)):
        new_data_dic[values[j]].append(rbsp_ephm[values[j]])
This works fine if I only have one file, but if I have two or more it seems to close the key? I'm not sure if this is exactly what is happening, but when I ask what new_data_dic is, for values it gives {'Bfs_geo_a': <Closed HDF5 dataset>,... which will not write to a text file. I've tried closing the hdf5 file before opening the next (rbsp_ephm.close()) but I get the same error.
Thanks for any and all help!
I don't really understand your problem... are you trying to create a list of hdf5 datasets?
Or did you just forget the [()] to access the values in the dataset itself?
Here is a simple standalone example that works just fine:
import h5py

# File creation
filenames_a = []
values = ['values/toto', 'values/tata', 'values/tutu']
nb_file = 5
tmp = 0
for i in range(nb_file):
    fname = 'file%s.h5' % i
    filenames_a.append(fname)
    file = h5py.File(fname, 'w')
    grp = file.create_group('values')
    for value in values:
        file[value] = tmp
        tmp += 1
    file.close()

# the thing you want
new_data_dict = {value: [] for value in values}
for fname in filenames_a:
    rbsp_ephm = h5py.File(fname, 'r')
    for value in values:
        new_data_dict[value].append(rbsp_ephm[value][()])

print new_data_dict
It returns :
{'values/tutu': [2, 5, 8, 11, 14], 'values/toto': [0, 3, 6, 9, 12], 'values/tata': [1, 4, 7, 10, 13]}
Does it answer your question?
Maybe not directly the right solution, but you could try to extract the data as numpy arrays, which are a more flexible format than the h5py dataset one. See below how to do it:
>>> print type(file['Average/u'])
<class 'h5py.highlevel.Dataset'>
>>> print type(file['Average/u'][:])
<type 'numpy.ndarray'>
And just in case, you should try to use a more "pythonic" way for your loop, that is:
for j in values:
    new_data_dic[j] = rbsp_ephm[j]
instead of:
for j in range(len(values)):
    new_data_dic[values[j]] = rbsp_ephm[values[j]]
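Putting the two suggestions together, the original loop could become something like this sketch (assuming the filenames_a and values lists from the question):

import h5py

new_data_dic = {value: [] for value in values}
for fname in filenames_a:
    with h5py.File(fname, 'r') as rbsp_ephm:
        for value in values:
            # slicing copies the data out as a numpy array, so nothing is
            # left pointing at a closed HDF5 dataset
            new_data_dic[value].append(rbsp_ephm[value][:])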
