Python: numpy, how to len of csv

Python: numpy, how to len of csv - python

I have 2 files:
res_my_inp.csv : http://ge.tt/8Fa4d512/v/0?c
res_my_out.csv : http://ge.tt/8Fa4d512/v/1?c
I want to get matrix from res_my_inp.csv and array from res_my_out.csv (get only 1 column)
My code:
import csv
in_filepath = os.path.abspath( "res_my_inp.csv" )
out_filepath = os.path.abspath( "res_my_out.csv" )
def ReadCsv( fileName, mode ):
in_file = open( fileName )
reader = csv.reader( in_file, delimiter = '\t', quotechar = '"' )
if ( mode == 'input' ):
data = [[]]
for row in reader:
data.append( row )
elif( mode == 'output' ):
column = 1
data = []
for row in reader:
data.append( int( row[column] ) )
data.pop(0)
return data
X = np.asarray( ReadCsv( in_filepath, 'input' ), 'int' )
Y = np.asarray( ReadCsv( out_filepath, 'output' ), 'int' )
print(len(X))
print(len(Y))
Why len(X) = 100, but len(Y) = 99 ?

The length discrepancy is because this line:
data = [[]]
is different from this line:
data = []
In the 'input' branch you've got an empty list at the start of your data list that isn't present in the 'output' branch.

Related

IndexError: index 4 is out of bounds for axis 0 with size 4

Hey I am having this Index Error where I am trying to composite events but my indices start at 0 and not 1 and while have tried to do a number of things like trying to .append[i+1] I am unable to fix this error I am having.
Theres apparently something wrong with this specific line of code : dset_IDX[offset:offset_next] = event_id[file_indices]
While the .py file is over a 1000 lines of code so I can not show all of it I am able to show the part of the function that is giving me the error
def count_events(files):
# Because we want to remove events with 0 hits,
# we need to count the events beforehand (to create the h5 file).
# This function counts and indexes the events with more than 0 hits.
# Files need to be iterated in the same order to use the indexes.
""" This is where we manually specify the file"""
num_events = 0
nonzero_file_events = []
for file_index, f in enumerate(files):
data = np.load(f, allow_pickle=True)
nonzero_file_events.append([])
hits = data['digi_hit_pmt']
for i in range(len(hits)):
if len(hits[i]) != 0:
nonzero_file_events[file_index].append(i)
num_events += 1
return num_events, nonzero_file_events
def GenMapping(csv_file):
mPMT_to_index = {}
with open(csv_file) as f:
rows = f.readline().split(",")[1:]
rows = [int(r.strip()) for r in rows]
for line in f:
line_split = line.split(",")
col = int(line_split[0].strip())
for row, value in zip(rows, line_split[1:]):
value = value.strip()
if value: # If the value is not empty
mPMT_to_index[int(value)] = [col, row]
npmap = np.zeros((max(mPMT_to_index) + 1, 2), dtype=np.int)
for k, v in mPMT_to_index.items():
npmap[k] = v
return npmap
def GenerateMultiMuonSample_h5(avg_mu_per_ev=2.5, sigma_time_offset=21.2):
"""
Inputs:
avg_mu_per_ev == Poisson distribution mean for number of muons in each spill
sigma_time_offset == Width of spill (Gaussian) in nanoseconds
"""
files = ['event998.npz']
# Remove whitespace
files = [x.strip() for x in files]
# Check that files were provided
if len(files) == 0:
raise ValueError("No files provided!!")
print("Merging " + str(len(files)) + " files")
# Start merging
num_nonzero_events, nonzero_event_indexes = count_events(files)
print(num_nonzero_events)
# np.random.poisson( avg_mu_per_ev, number_of_throws )
num_muons = np.random.poisson(avg_mu_per_ev, num_nonzero_events - 2954)
# creates h5 file to generate the h5 file
dtype_events = np.dtype(np.float32)
dtype_labels = np.dtype(np.int32)
dtype_energies = np.dtype(np.float32)
dtype_positions = np.dtype(np.float32)
dtype_IDX = np.dtype(np.int32)
dtype_PATHS = h5py.special_dtype(vlen=str)
dtype_angles = np.dtype(np.float32)
# sets h5 file to be written
h5_file = h5py.File('multimuonfile(2).h5', 'w')
dset_event_data = h5_file.create_dataset("event_data",
shape=(num_nonzero_events,) + IMAGE_SHAPE,
dtype=dtype_events)
dset_labels = h5_file.create_dataset("labels",
shape=(num_nonzero_events,),
dtype=dtype_labels)
dset_energies = h5_file.create_dataset("energies",
shape=(num_nonzero_events, 1),
dtype=dtype_energies)
dset_positions = h5_file.create_dataset("positions",
shape=(num_nonzero_events, 1, 3),
dtype=dtype_positions)
dset_IDX = h5_file.create_dataset("event_ids",
shape=(num_nonzero_events,),
dtype=dtype_IDX)
dset_PATHS = h5_file.create_dataset("root_files",
shape=(num_nonzero_events,),
dtype=dtype_PATHS)
dset_angles = h5_file.create_dataset("angles",
shape=(num_nonzero_events, 2),
dtype=dtype_angles)
# 22 -> gamma, 11 -> electron, 13 -> muon
# corresponds to labelling used in CNN with only barrel
# IWCDmPMT_4pi_full_tank_gamma_E0to1000MeV_unif-pos-R371-y521cm_4pi-dir_3000evts_329.npz has an event
# with pid 11 though....
# pid_to_label = {22:0, 11:1, 13:2}
offset = 0
offset_next = 0
mPMT_to_index = GenMapping(PMT_LABELS)
# Loop over files
for file_index, filename in enumerate(files):
data = np.load(filename, allow_pickle=True)
nonzero_events_in_file = len(nonzero_event_indexes[file_index])
x_data = np.zeros((nonzero_events_in_file,) + IMAGE_SHAPE,
dtype=dtype_events)
digi_hit_pmt = data['digi_hit_pmt']
# digi_hit_charge = data['digi_hit_charge']
# digi_hit_time = data['digi_hit_time']
# digi_hit_trigger = data['digi_hit_trigger']
# trigger_time = data['trigger_time']
delay = 0
# Loop over events in file
# Loop over number of muons in each event
event_id = np.array([], dtype=np.int32)
root_file = np.array([], dtype=np.str)
pid = np.array([])
position = np.array([])
direction = np.array([])
energy = np.array([])
labels = np.array([])
# with open("ResultFile.txt", "w") as text_file:
# sys.stdout = open("Result2.txt", "w")
for i, nmu in enumerate(num_muons):
# np.savetxt(text_file, i, nmu,fmt="%d")
# text_file.write("processing output entry " + str(i) + " with " + nmu + " muons")
print("processing output entry ", i, " with ", nmu, " muons")
indices = np.random.randint(0, len(digi_hit_pmt), max(1, nmu))
time_offs = [0.]
if nmu > 1:
time_offs = np.append(time_offs, np.random.normal(0., sigma_time_offset, nmu - 1))
hit_pmts, charge, time = SumEvents(indices, time_offs, data, nmu == 0)
hit_mpmts = hit_pmts // 19
pmt_channels = hit_pmts % 19
rows = mPMT_to_index[hit_mpmts, 0]
cols = mPMT_to_index[hit_mpmts, 1]
x_data[i - delay, rows, cols, pmt_channels] = charge
x_data[i - delay, rows, cols, pmt_channels + 19] = time
# fix below!!!
idx0 = indices[0]
event_id = np.append(event_id, data['event_id'][idx0])
root_file = np.append(root_file, data['root_file'][idx0])
pid = np.append(pid, data['pid'][idx0])
position = np.append(position, data['position'][idx0])
direction = np.append(direction, data['direction'][idx0])
energy = np.append(energy, np.sum(data['energy'][indices]))
labels = np.append(labels, nmu)
offset_next += nonzero_events_in_file
file_indices = nonzero_event_indexes[file_index]
dset_IDX[offset:offset_next] = event_id[file_indices]
dset_PATHS[offset:offset_next] = root_file[file_indices]
dset_energies[offset:offset_next, :] = energy[file_indices].reshape(-1, 1)
dset_positions[offset:offset_next, :, :] = position[file_indices].reshape(-1, 1, 3)
dset_labels[offset:offset_next] = labels[file_indices]
print(event_id)
direction = direction[file_indices]
polar = np.arccos(direction[:, 1])
azimuth = np.arctan2(direction[:, 2], direction[:, 0])
dset_angles[offset:offset_next, :] = np.hstack((polar.reshape(-1, 1), azimuth.reshape(-1, 1)))
dset_event_data[offset:offset_next, :] = x_data
offset = offset_next
print("Finished file: {}".format(filename))
#sys.stdout.close()
print("Saving")
#h5_file.close()
print("Finished")
# In[ ]:
GenerateMultiMuonSample_h5(avg_mu_per_ev=2.5, sigma_time_offset=21.2)
Traceback
Merging 1 files
2958
processing output entry 0 with 3 muons
processing output entry 1 with 1 muons
processing output entry 2 with 3 muons
processing output entry 3 with 3 muons
Traceback (most recent call last):
File "C:/Users/abdul/OneDrive/Desktop/ISSP/ISSP-AA/TriumfCNN-AA/EventDisplay.py", line 1068, in <module>
GenerateMultiMuonSample_h5(avg_mu_per_ev=2.5, sigma_time_offset=21.2)
File "C:/Users/abdul/OneDrive/Desktop/ISSP/ISSP-AA/TriumfCNN-AA/EventDisplay.py", line 1044, in GenerateMultiMuonSample_h5
dset_IDX[offset:offset_next] = event_id[file_indices]
IndexError: index 4 is out of bounds for axis 0 with size 4

not much info is provided but what i have understood,
the error says that axis 0 has size=4 and you are trying to access index 4 which is not possible with size 4 as it starts with 0 and max index could be 3.

append dictionaries from csv file

The following code applies one update to my project.
tagPath = ["Package_PLC/Tags/CCN_CNV01_MX001_A_FLT"]
alarmConfig = {"BLD_CHN01_VS001_A_FLT_C":[["enabled","Value","0"]]}
system.tag.editAlarmConfig(tagPaths, alarmConfig)
I need to do this hundreds of times.
I am trying to build tagPath and alarmConfig dictionaries from a csv file.
Sample csv:
Equipment,Item Name,Alarm Tag,Alarm Name,Cluster Name,Category,Alarm Desc,Delay,Help,Comment,Variable Tag A,Variable Tag B,Custom 1,Custom 2,Custom 3,Custom 4,Custom 5,Custom 6,Custom 7,Custom 8,Paging,Paging Group,Area,Privilege,Historize,Project,SEQUENCE,TAGGENLINK,EDITCODE,LINKED
"","","BLD_CHN01_VS001_A_FLT_C","BLD_CHN01_VS001_A_FLT_C","","","Catch-up Conveyor / Chain Comms Fault","00:00:00","","BLD_CHN01_VS001_A_FLT_C","BLD_CHN01_VS001_A_FLT_C","KFS_ZNE02_WRM","STUN","","","","","","","","","","1","","","","","","",""
"","","BLD_CHN01_VS001_A_FLT_V","BLD_CHN01_VS001_A_FLT_V","","","Catch-up Conveyor / Chain VSD Fault","00:00:00","","BLD_CHN01_VS001_A_FLT_V","BLD_CHN01_VS001_A_FLT_V","","STUN","","","","","","","","","","1","","","","","","",""
"","","BLD_CHN01_VS001_S_HTY","BLD_CHN01_VS001_S_HTY","","","Catch-up Conveyor / Chain Cicuit Breaker","00:00:00","","BLD_CHN01_VS001_S_HTY","NOT BLD_CHN01_VS001_S_HTY","KFS_ZNE02_WRM","STUN","","","","","","","","","","1","","","","","","",""
This is what I have so far:
import system
import csv
path = system.file.openFile('csv')
if path != None:
print "path found"
f=open(path)
reader = csv.DictReader(f)
path1 = "Package_PLC/Tags/"
tagpath = []
alarmConfig = []
state = 0
comment = ""
for i in reader:
if row['Alarm Tag'] == 'ECN*' || 'FCN*' || 'PAC*':
tagpath.append(path1 + int(row['Alarm Tag']))
alarmname = row[Alarm Tag]
if row[Variable Tag A] == "NOT*":
state = 0
else:
state = 1
comment = row[Alarm Desc]
alarmConfig.append({alarmname: [["setpointA","Value",state],
["displayPath","Value","Packing"],
["notes","Value",comment]]
})
system.tag.editAlarmConfig(tagPaths, alarmConfig)
f.close()
The following error gets thrown.
Traceback (most recent call last):
File "<buffer>", line 28, in <module>
TypeError: list indices must be integers

This worked.
import string
import system
import csv
path = system.file.openFile('csv')
if path != None:
print "path found"
f=open(path)
reader = csv.DictReader(f)
path1 = "Package_PLC/Tags/"
tagpath = []
alarmConfig = {}
state = 0
readerlist = list(reader)
for stuff in readerlist:
if "PAC" in stuff['Alarm Tag'] or "ECN" in stuff['Alarm Tag'] or "CCN" in stuff['Alarm Tag'] or "FCN" in stuff['Alarm Tag'] :
tagpath = []
tagpath.append(str( path1 + stuff['Alarm Tag']))
if "NOT" in stuff['Variable Tag A']:
state = 0
else :
state = 1
display = ['displayPath','Value','Packing']
notes = ['notes','Value',str(stuff['Alarm Desc'])]
setpointA =['setpointA','Value', str(state)]
alarmConfig = {}
alarmConfig[stuff['Alarm Tag']] = [display,notes,setpointA]
system.tag.editAlarmConfig(tagpath, alarmConfig)
f.close()

It's difficult to help you because:
The sample file doesn't trigger anything
You didn't provide the system module
But still here's my attempt:
import os.path
import csv
input_file_name = 'Sample.csv'
if os.path.exists(input_file_name):
with open(input_file_name, newline='') as input_file:
events = csv.DictReader(input_file)
data_extracted = [
(
current_event['Alarm Tag'],
0 if current_event['Variable Tag A'].startswith('NOT') else 1,
current_event['Alarm Desc']
)
for current_event in events
if current_event['Alarm Tag'][:3] in ('ECN', 'FCN', 'PAC')
]
tag_paths = [f'Package_PLC/Tags/{x[0]}' for x in data_extracted]
alarm_config = {
alarm_name: [
['setpointA', 'Value', state],
['displayPath', 'Value', 'Packing'],
['notes', 'value', comment]
]
for (alarm_name, state, comment) in data_extracted
}
system.tag.editAlarmConfig(tag_paths, alarm_config)

Problems porting code from Python 2.7 to 3.6

I have a fragment of code which loads data from a .csv file.
It's written for Python 2.7 but in Python 3.6 does not work.
def load_new_data(self):
full = list()
with open(self.filename, 'rb') as csv_in:
myreader2 = csv.reader(csv_in, delimiter=';')
count = 0
for row in myreader2:
if count == 0:
headers = row[1:]
count += 1
elif count == 1:
count += 1
else:
current_row = row[1:-1]
full.append(current_row)
count += 1
new_df = pd.DataFrame.from_records(full, columns=headers)
new_df = new_df.iloc[1:, :80]
self.fill_in_blanks(new_df)
new_df = dp.remove_inc_variables(new_df, .1)
print '\t Removing incomplete variables.'
for i in new_df.columns:
try:
new_df.loc[:, i] = new_df.loc[:, i].astype(float)
except:
pass
return new_df
the error I get is:
212
213 count = 0
--> 214 for row in myreader2:
215 if count == 0:
216 headers = row[1:]
Error: iterator should return strings, not bytes (did you open the file in
text mode?)
I did try changing the 'rb' to 'r' and 'rt' and even deleting it, as other posts here suggest, but with no success...

try this
def load_new_data(self):
full = list()
with open(self.filename, 'r') as csv_in:
myreader2 = csv.reader(csv_in, delimiter=';')
count = 0
for row in myreader2:
if count == 0:
headers = row[1:]
count += 1
elif count == 1:
count += 1
else:
current_row = row[1:-1]
full.append(current_row)
count += 1
new_df = pd.DataFrame.from_records(full, columns=headers)
new_df = new_df.iloc[1:, :80]
self.fill_in_blanks(new_df)
new_df = dp.remove_inc_variables(new_df, .1)
print ('\t Removing incomplete variables.')
for i in new_df.columns:
try:
new_df.loc[:, i] = new_df.loc[:, i].astype(float)
except:
pass
return new_df

You should try codecs, for open file. Be careful this file encoding.
Sample:
def load_new_data(self):
with codecs.open(self.filename, 'rb', encoding="cp1251") as csv_in: # cp1251 replace for your encoding!
myreader2 = csv.reader(csv_in, delimiter=';')
headers = next(myreader2)[1:]
next(myreader2)
full = [row[1:] for row in myreader2]
new_df = pd.DataFrame.from_records(full, columns=headers)
new_df = new_df.iloc[1:, :80]
self.fill_in_blanks(new_df)
new_df = dp.remove_inc_variables(new_df, .1)
print('\t Removing incomplete variables.')
for i in new_df.columns:
try:
new_df.loc[:, i] = new_df.loc[:, i].astype(float)
except:
pass
return new_df

My function to extract totals is exhausting my input file for future reading

The client includes 3 rows at the bottom that contain totals for me to reconcile against in my program. Only problem is that my program is exhausting the input file with readlines() before it can do anything else. Is there a way to keep the file from being exhausted during my get_recon_total function call?
#!/usr/bin/env python
# pre_process.py
import csv
import sys
def main():
infile = sys.argv[1]
outfile = sys.argv[2]
with open(infile, 'rbU') as in_obj:
# Create reader object, get fieldnames for later on
reader, fieldnames = open_reader(in_obj)
nav_tot_cnt, nav_rec_cnt, nav_erec_cnt = get_recon_totals(in_obj)
print nav_tot_cnt, nav_rec_cnt, nav_erec_cnt
# This switches the dictionary to a sorted list... necessary??
reader_list = sorted(reader, key=lambda key: (key['PEOPLE_ID'],
key['DON_DATE']))
# Create a list to contain section header information
header_list = create_header_list(reader_list)
# Create dictionary that contains header list as the key,
# then all rows that match as a list of dictionaries.
master_dict = map_data(header_list, reader_list)
# Write data to processed file, create recon counts to compare
# to footer record
tot_cnt, rec_cnt, erec_cnt = write_data(master_dict, outfile, fieldnames)
print tot_cnt, rec_cnt, erec_cnt
def open_reader(file_obj):
'''
Uses DictReader from the csv module to take the first header line
as the fieldnames, then applies them to each element in the file.
Returns the DictReader object and the fieldnames being used (used
later when data is printed out with DictWriter.)
'''
reader = csv.DictReader(file_obj, delimiter=',')
return reader, reader.fieldnames
def create_header_list(in_obj):
p_id_list = []
for row in in_obj:
if (row['PEOPLE_ID'], row['DON_DATE']) not in p_id_list:
p_id_list.append((row['PEOPLE_ID'], row['DON_DATE']))
return p_id_list
def map_data(header_list, data_obj):
master_dict = {}
client_section_list = []
for element in header_list:
for row in data_obj:
if (row['PEOPLE_ID'], row['DON_DATE']) == element:
client_section_list.append(row)
element = list(element)
element_list = [client_section_list[0]['DEDUCT_AMT'],
client_section_list[0]['ND_AMT'],
client_section_list[0]['DEDUCT_YTD'],
client_section_list[0]['NONDEDUCT_YTD']
]
try:
element_list.append((float(client_section_list[0]['DEDUCT_YTD']) +
float(client_section_list[0]['NONDEDUCT_YTD'])
))
except ValueError:
pass
element.extend(element_list)
element = tuple(element)
master_dict[element] = client_section_list
client_section_list = []
return master_dict
def write_data(in_obj, outfile, in_fieldnames):
with open(outfile, 'wb') as writer_outfile:
writer = csv.writer(writer_outfile, delimiter=',')
dict_writer = csv.DictWriter(writer_outfile,
fieldnames=in_fieldnames,
extrasaction='ignore')
tot_cnt = 0
rec_cnt = 0
email_cnt = 0
for k, v in in_obj.iteritems():
writer_outfile.write(' -01- ')
writer.writerow(k)
rec_cnt += 1
for i, e in enumerate(v):
if v[i]['INT_CODE_EX0006'] != '' or v[i]['INT_CODE_EX0028'] != '':
email_cnt += 1
writer_outfile.write(' -02- ')
dict_writer.writerow(e)
tot_cnt += 1
return tot_cnt, rec_cnt, email_cnt
def get_recon_totals(in_obj):
print in_obj
client_tot_cnt = 0
client_rec_cnt = 0
client_erec_cnt = 0
for line in in_obj.readlines():
line = line.split(',')
if line[0] == 'T' and line[1] == 'Total Amount':
print 'Total Amount found.'
client_tot_cnt = line[2]
elif line[0] == 'T' and line[1] == 'Receipt Count':
print 'Receipt Count found.'
client_rec_cnt = line[2]
elif line[0] == 'T' and line[1] == 'Email Receipt Count':
print 'E-Receipt Count Found.'
client_erec_cnt = line[2]
return client_tot_cnt, client_rec_cnt, client_erec_cnt
if __name__ == '__main__':
main()

If your file is not very large, you can convert reader generator to a list of dcitonary , by calling list() on reader and then use it in your code instead of trying to read from the file directly.
Example -
def main():
infile = sys.argv[1]
outfile = sys.argv[2]
with open(infile, 'rbU') as in_obj:
# Create reader object, get fieldnames for later on
reader, fieldnames = open_reader(in_obj)
reader_list = list(reader)
nav_tot_cnt, nav_rec_cnt, nav_erec_cnt = get_recon_totals(reader_list)
print nav_tot_cnt, nav_rec_cnt, nav_erec_cnt
# This switches the dictionary to a sorted list... necessary??
reader_list = sorted(reader_list, key=lambda key: (key['PEOPLE_ID'],
key['DON_DATE']))
.
.
def get_recon_totals(reader_list):
print in_obj
client_tot_cnt = 0
client_rec_cnt = 0
client_erec_cnt = 0
for line in reader_list: #line here is a dict
if line[<fieldname for first column>] == 'T' and line[<fieldname for secondcolumn>] == 'Total Amount':
print 'Total Amount found.'
client_tot_cnt = line[<fieldname for third column>]
.
. #continued like above
.
return client_tot_cnt, client_rec_cnt, client_erec_cnt

Writing values CSV file

I am trying to write some output to csv from my code below. First column should have all of the valid IDs with a header that says “Valid (count in parenthesis)”. The second column should contain a list of all of the non-valid IDs and have a header that says “Non-valid (count in parenthesis)”. Any idea how I do this?
import csv
# csv_path = r'C:\temp\data\fileA'
csv_path = r'C:\temp\data\fileA'
reader = csv.reader(open(csv_path, 'r'), dialect='excel-tab')
reader.next() # ignore heading
min_id = 1503332138
max_id = 1503632138
valid_ids = []
invalid = []
x = 0
for line in reader:
pv = line[1]
if id.isdigit() and int(id) >= min_id and int(id) <= max_id:
if id not in valid_ids:
valid_ids.append(id)
else:
if id not in invalid:
invalid.append(id)
print 'Valid IDs (',len(valid_ids),')'
for valid in valid_ids:
print valid
print 'Invalid IDs (',len(invalid),')'
for invalid in invalid:
print invalid

# ...
# Continuing from point where you have valid_ids and invalid lists populated
data = [('Valid IDs', valid_ids), ('Invalid IDs', invalid)]
# Create header
header = []
for (label, id_list) in data:
label_with_count = '%s (%d)' % (label, len(id_list))
header.append(label_with_count)
# Write to CSV file
with open('path_to_output_file.csv') as out_csv_file:
csv_writer = csv.writer(out_csv_file)
csv_writer.writerow(header)
for (idx, dataset) in enumerate(data):
(label, id_list) = dataset
for id in id_list:
row = (idx * ['']) + [id] + ((len(data) - idx - 1) * [''])
csv_writer.writerow(row)

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Python: numpy, how to len of csv - python

The length discrepancy is because this line: data = [[]] is different from this line: data = [] In the 'input' branch you've got an empty list at the start of your data list that isn't present in the 'output' branch.

Related

IndexError: index 4 is out of bounds for axis 0 with size 4

append dictionaries from csv file

Problems porting code from Python 2.7 to 3.6

My function to extract totals is exhausting my input file for future reading

Writing values CSV file

Categories

Resources