Why am I getting this KeyError using multiprocessing and pandas? - python

I'm trying to use multiprocessing on a fuzzy matching script I've written. I need to perform 1.4 billion comparisons, which takes 30+ hours without multiprocessing, so I'm attempting to integrate it here.
def fuzzyCompare(data1, data2):
    print("Performing Fuzzy Matches...\n")
    similarityDf = pd.DataFrame(columns=["Similarity Ratio", "Id1", "Id2"])
    count = 0
    for i in range(len(data1)):
        str1 = data1["CompanyName"][i] + "," + data1["Address1"][i] + "," + data1["City"][i] + "," + data1["PostalZip"][i]
        str1 = str1.lower().replace(" ", "")
        for j in range(len(data2)):
            str2 = data2["Company"][j] + "," + data2["Physical Street 1"][j] + "," + data2["Physical City"][j] + "," + data2["Physical Postal Code/ZIP"][j]
            str2 = str2.lower().replace(" ", "")
            ratio = fuzz.ratio(str1, str2)
            if(ratio > 0):
                similarityDf.at[count, "Similarity Ratio"] = str(ratio) + "%"
                similarityDf.at[count, "Id1"] = data1["Id1"][i]
                similarityDf.at[count, "Id2"] = data2["Id2"][j]
                count = count + 1
    print("Performed " + str(len(data1)*len(data2)) + " Fuzzy Comparisons.\n")
    return similarityDf
def main():
    data1 = readData(excelFile1)  # read Excel file into dataframe
    data2 = readData(excelFile2)  # read Excel file into dataframe
    df_split = np.array_split(data2, 4)  # split data2 into 4
    args = [(data1, df_split[0]),
            (data1, df_split[1]),
            (data1, df_split[2]),
            (data1, df_split[3])]
    with mp.Pool(processes=4) as p:
        outputData = pd.concat(p.starmap(fuzzyCompare, args))

if __name__ == "__main__":
    mp.freeze_support()
    main()
I have a print statement at the end of my fuzzyCompare(), and it prints the result for only one worker; then I receive the following error:
multiprocessing.pool.RemoteTraceback
Traceback (most recent call last):
File "C:\Users\...\AppData\Local\Programs\Python\Python37-32\lib\multiprocessing\pool.py", line 121, in worker
result = (True, func(*args, **kwds))
File "C:\Users\...\AppData\Local\Programs\Python\Python37-32\lib\multiprocessing\pool.py", line 47, in starmapstar
return list(itertools.starmap(args[0], args[1]))
File "C:\Users\...\Documents\Code\Python\fuzzyCompare\multiFuzzyCLI.py", line 47, in fuzzyCompare
str2 = data2["Company"][j] + "," + data2["Physical Street 1"][j] + "," + data2["Physical City"][j] + "," + data2["Physical Postal Code/ZIP"][j]
File "C:\Users\...\AppData\Local\Programs\Python\Python37-32\lib\site-packages\pandas\core\series.py", line 1068, in __getitem__
result = self.index.get_value(self, key)
File "C:\Users\...\AppData\Local\Programs\Python\Python37-32\lib\site-packages\pandas\core\indexes\base.py", line 4730, in get_value
return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None))
File "pandas\_libs\index.pyx", line 80, in pandas._libs.index.IndexEngine.get_value
File "pandas\_libs\index.pyx", line 88, in pandas._libs.index.IndexEngine.get_value
File "pandas\_libs\index.pyx", line 131, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 992, in pandas._libs.hashtable.Int64HashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 998, in
pandas._libs.hashtable.Int64HashTable.get_item
KeyError: 0
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "multiFuzzyCLI.py", line 145, in <module>
main()
File "multiFuzzyCLI.py", line 132, in main
outputData = pd.concat(p.starmap(fuzzyCompare, args))
File "C:\Users\...\AppData\Local\Programs\Python\Python37-32\lib\multiprocessing\pool.py", line 276, in starmap
return self._map_async(func, iterable, starmapstar, chunksize).get()
File "C:\Users\...\AppData\Local\Programs\Python\Python37-32\lib\multiprocessing\pool.py", line 657, in get
raise self._value
KeyError: 0
I know what a KeyError is; I just don't understand how it's occurring in this case.
Thanks

You get a KeyError because you index every DataFrame split with labels starting from 0, while np.array_split preserves the original Index of each split.
To properly select the i-th row of a DataFrame you should always use DataFrame.iloc, as this works for any Index, not just a RangeIndex that begins at 0. So you need to change all of your selections to be of the form:
data2["Company"].iloc[j] # Not data2["Company"][j]
Worked Example
import pandas as pd
import numpy as np

df = pd.DataFrame({'CompanyName': list('abcdefghij')})
df_split = np.array_split(df, 4)

# For the first split this works, because we get lucky: its index starts from 0
data2 = df_split[0]
for j in range(len(data2)):
    print(data2['CompanyName'][j])
# a
# b
# c

# Later splits fail; `df_split[1].index` is RangeIndex(start=3, stop=6, step=1)
data2 = df_split[1]
for j in range(len(data2)):
    print(data2['CompanyName'][j])
# KeyError: 0

# Instead, properly select with `.iloc`
for j in range(len(data2)):
    print(data2['CompanyName'].iloc[j])
# d
# e
# f
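Alternatively, if you would rather keep the label-based [j] lookups unchanged, you can reset each split's index before handing the splits to the workers. A minimal sketch, assuming the same df as above:

# Give each split a fresh RangeIndex starting at 0, so label lookups
# like data2['CompanyName'][j] line up with range(len(data2)) again
df_split = [part.reset_index(drop=True) for part in np.array_split(df, 4)]
data2 = df_split[1]
for j in range(len(data2)):
    print(data2['CompanyName'][j])  # now works: d, e, f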

Related

How to solve Exception while handling 'read_file' (thonny.plugins.micropython.mp_back.ManagementError: Script produced errors)?

I'm using Thonny to operate on a Raspberry Pi Pico with MicroPython. The script is:
def writejson(gps_altitudes, gps_time, baro_altitudes, baro_time):
    f = open("samples.json", "w")
    f.write('{\n"gps_altitude": [')
    result = ""
    c = 0
    for element in gps_altitudes:
        flag = True
        result = result + str(element)
        c += 1
        if c != len(gps_altitudes):
            result += ", "
    f.write(result)
    result = ""
    result = result + '],\n"gps_time": ['
    c = 0
    for element in gps_time:
        result = result + "[" + str(element[0]) + ", " + str(element[1]) + ", " + str(element[2]) + "]"
        c += 1
        if c != len(gps_time):
            result += ", "
    f.write(result)
    result = ""
    result = result + '],\n"baro_altitude": ['
    c = 0
    for element in baro_altitudes:
        flag = True
        result = result + str(element)
        c += 1
        if c != len(baro_altitudes):
            result += ", "
    f.write(result)
    result = ""
    result = result + '],\n"baro_time": ['
    c = 0
    for element in baro_time:
        result = result + "[" + str(element[0]) + ", " + str(element[1]) + ", " + str(element[2]) + "]"
        c += 1
        if c != len(baro_time):
            result += ", "
    f.write(result)
    result = ""
    f.write(']\n}')
    f.close()
To summarize, I have just created a JSON file. The problem is that if the file exceeds a certain length (which varies), Thonny gives me this error when I try to open it.
PROBLEM IN THONNY'S BACK-END: Exception while handling 'read_file' (thonny.plugins.micropython.mp_back.ManagementError: Script produced errors).
log is:
Traceback (most recent call last):
File "B:\Thonny\lib\tkinter\__init__.py", line 1921, in __call__
return self.func(*args)
File "B:\Thonny\lib\site-packages\thonny\base_file_browser.py", line 586, in on_double_click
self.open_file(path)
File "B:\Thonny\lib\site-packages\thonny\base_file_browser.py", line 1201, in open_file
get_workbench().get_editor_notebook().show_remote_file(path)
File "B:\Thonny\lib\site-packages\thonny\editors.py", line 1069, in show_remote_file
return self.show_file(make_remote_path(target_filename))
File "B:\Thonny\lib\site-packages\thonny\editors.py", line 1043, in show_file
editor = self.get_editor(filename, True)
File "B:\Thonny\lib\site-packages\thonny\editors.py", line 1134, in get_editor
return self._open_file(filename_or_id)
File "B:\Thonny\lib\site-packages\thonny\editors.py", line 1116, in _open_file
if editor._load_file(filename):
File "B:\Thonny\lib\site-packages\thonny\editors.py", line 206, in _load_file
result = self._load_remote_file(filename)
File "B:\Thonny\lib\site-packages\thonny\editors.py", line 265, in _load_remote_file
raise RuntimeError(response["error"])
RuntimeError: Backend terminated
I don't understand why this happens. If someone can help, thanks in advance (:
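As an aside, hand-assembling JSON strings like this is easy to get wrong. Assuming your MicroPython firmware includes the json module (most Pico builds ship it, aliased from ujson), a sketch like the following could replace the manual bracket and comma bookkeeping:

import json  # provided as ujson on many MicroPython builds

def writejson(gps_altitudes, gps_time, baro_altitudes, baro_time):
    # Let the serializer handle brackets, commas and escaping
    payload = {
        "gps_altitude": list(gps_altitudes),
        "gps_time": [list(t) for t in gps_time],
        "baro_altitude": list(baro_altitudes),
        "baro_time": [list(t) for t in baro_time],
    }
    with open("samples.json", "w") as f:
        json.dump(payload, f)

This does not address the size-related back-end error itself, but it rules out malformed JSON as a cause.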

How to convert csv's to line protocol

How do I display a CSV file in line protocol format, like influxdb uses?
My CSV data is as below...
time, avg_FreshOrders, p95_FreshOrders, FreshOrders
1593648000000,1479.08407079646,2589,226
1593475200000,2242.8617021276596,5622,188
1593734400000,1682.3375,2738,160
I am using Python to convert to line protocol as below
import pandas as pd

# convert csv's to line protocol
df_full = pd.read_csv("data/FreshOrders.csv")
df_full["measurement"] = ['FO' for t in range(len(df_full))]
lines = [str(df_full["measurement"][d])
         + ",type=FreshOrders"
         + " "
         + "avg_FreshOrders=" + str(df_full["avg_FreshOrders"][d]) + ","
         + "p95_FreshOrders=" + str(df_full["p95_FreshOrders"][d]) + ","
         + "FreshOrders=" + str(df_full["FreshOrders"][d])
         + " " + str(df_full["time"][d]) for d in range(len(df_full))]

# append lines to a text file with DDL & DML:
thefile = open('data/import.txt', 'a+')
for item in lines:
    thefile.write("%s\n" % item)
While running this Python code I am getting the errors below.
Traceback (most recent call last):
File "csv_to_line.py", line 6, in <module>
lines = [str(df_full["measurement"][d])
File "csv_to_line.py", line 6, in <listcomp>
lines = [str(df_full["measurement"][d])
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\series.py", line 871, in __getitem__
result = self.index.get_value(self, key)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\indexes\base.py", line 4405, in get_value
return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None))
File "pandas\_libs\index.pyx", line 80, in pandas._libs.index.IndexEngine.get_value
File "pandas\_libs\index.pyx", line 90, in pandas._libs.index.IndexEngine.get_value
File "pandas\_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 997, in pandas._libs.hashtable.Int64HashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1004, in pandas._libs.hashtable.Int64HashTable.get_item
KeyError: 0
Can someone please help me to resolve this issue?
If you have a problem with the key-based lookup, you can use positional indexing with .iloc instead.
import pandas as pd

# convert csv's to line protocol
df_full = pd.read_csv("haha.csv")
df_full["measurement"] = ['FO' for t in range(len(df_full))]
lines = []
for idx in range(len(df_full)):
    temp = str(df_full.iloc[idx, -1]) + ",type=FreshOrders" + " " + "avg_FreshOrders=" + str(df_full.iloc[idx, 1]) + "," \
        + "p95_FreshOrders=" + str(df_full.iloc[idx, 2]) + "," + "FreshOrders=" + str(df_full.iloc[idx, 3]) + " " + \
        str(df_full.iloc[idx, 0])
    lines.append(temp)

# append lines to a text file with DDL & DML:
thefile = open('data/import.txt', 'a+')
for item in lines:
    thefile.write("%s\n" % item)

File "pandas\_libs\hashtable_class_helper.pxi", line 998, in pandas._libs.hashtable.Int64HashTable.get_item KeyError: 3327

In the below code I am trying to get the counts u_count and h_count, but every time I get the KeyError.
import numpy as np
import pandas as pd
import matplotlib
import re
import datetime

pattern = '^(([0-2]?[0-9]/[0-9]?[0-9]/[0-9][0-9]), ([0-9]?[0-9]:[0-9][0-9]\s\w{2}) - (\w+\s\w+|\w+|):( [\w ]+))'

def startsWithDateTime(pattern, s):
    result = re.match(pattern, s)
    if result:
        return True
    return False

def getDataPoint(pattern, s):
    result = re.match(pattern, s)
    date = result[2]
    time = result[3]
    author = result[4]
    message = result[5]
    return date, time, author, message

parsedData = []  # List to keep track of data so it can be used by a Pandas dataframe
conversationPath = "WhatsApp_Chat_with_Umesh.txt"  # text file
with open(conversationPath, encoding="utf-8") as fp:
    fp.readline()
    messageBuffer = []  # Buffer to capture intermediate output for multi-line messages
    date, time, author = None, None, None  # Intermediate variables to keep track of the current message being processed
    while True:
        line = fp.readline()
        if not line:  # Stop reading further if end of file has been reached
            break
        line = line.strip()  # Guarding against erroneous leading and trailing whitespaces
        if startsWithDateTime(pattern, line):  # If a line starts with a Date Time pattern, then this indicates the beginning of a new message
            if len(messageBuffer) > 0:  # Check if the message buffer contains characters from previous iterations
                parsedData.append([date, time, author, ' '.join(messageBuffer)])  # Save the tokens from the previous message in parsedData
                messageBuffer.clear()  # Clear the message buffer so that it can be used for the next message
            date, time, author, message = getDataPoint(pattern, line)  # Identify and extract tokens from the line
            messageBuffer.append(message)  # Append message to buffer
        else:
            messageBuffer.append(line)

df = pd.DataFrame(parsedData, columns=['Date', 'Time', 'Author', 'Message'])

def count(df):
    df['Letter_count'] = df['Message'].apply(lambda s: len(s))
    df['Word_count'] = df['Message'].apply(lambda s: len(s.split()))

# count(df)
# print(df.head(50))
# print(df['Date'][0])

temp = 0
i = 0
h_count = 0
u_count = 0
while True:
    temp = df['Date'][i]
    filter = df[df['Date'] == temp]
    data = filter.iloc[0]
    # print(data.loc['Author'])
    # print(type(data))
    if data.loc['Author'] == 'Umesh Yadav':
        u_count += 1
    else:
        h_count += 1
    i = i + 1
Error Log:
(whatsup_env) L:\whatsup_chat_analyzer\WhatsApp-Chat-Analyzer>C:/Users/Harish/Anaconda3/python.exe l:/whatsup_chat_analyzer/WhatsApp-Chat-Analyzer/analyzer.py
Traceback (most recent call last):
  File "l:/whatsup_chat_analyzer/WhatsApp-Chat-Analyzer/analyzer.py", line 66, in <module>
    temp = df['Date'][i]
  File "C:\Users\Harish\Anaconda3\lib\site-packages\pandas\core\series.py", line 1068, in __getitem__
    result = self.index.get_value(self, key)
  File "C:\Users\Harish\Anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 4730, in get_value
    return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None))
  File "pandas\_libs\index.pyx", line 80, in pandas._libs.index.IndexEngine.get_value
  File "pandas\_libs\index.pyx", line 88, in pandas._libs.index.IndexEngine.get_value
  File "pandas\_libs\index.pyx", line 131, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\hashtable_class_helper.pxi", line 992, in pandas._libs.hashtable.Int64HashTable.get_item
  File "pandas\_libs\hashtable_class_helper.pxi", line 998, in pandas._libs.hashtable.Int64HashTable.get_item
KeyError: 3327
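No answer is recorded here, but the traceback points at the final while True loop, which has no stopping condition: i keeps growing past the last row label, and with a default RangeIndex a KeyError of 3327 suggests the frame has rows labelled 0 through 3326. If the goal is simply to count messages per author, a bounded sketch would be:

# Sketch: count messages per author without an unbounded counter
u_count = 0
h_count = 0
for author in df['Author']:
    if author == 'Umesh Yadav':
        u_count += 1
    else:
        h_count += 1

# or let pandas tally every author at once
print(df['Author'].value_counts())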

KeyException in Python Pandas

I am receiving the following error related to a KeyError. I have a large data set (in the realm of 10 million records) and I am trying to filter only the records that contain a key word in the 'tags' field. I can compare for exact matches easily, but parsing for records that merely contain the key word has proved difficult, and the methods I found on SO all throw an error. I am new to Pandas, so please forgive me if I am committing a cardinal sin. (I took BigData in university and we worked mostly in Spark; I realize the code is a bit hacky right now, I'm just trying to get it to function.)
Notes: (1) the data is stored across quarterly files, so I iterate over the files and concatenate the results (which is the reason for the index and the counter); (2) I commented out the lines that let me parse for exact matches (#is_goodwill = data_frame['tag'] == word_of_interest and #good_will_relation = data_frame[is_goodwill]).
Goal: filter for records containing the key word word_of_interest.
It does not have to be an exact match to the key word, but rather contain the keyword. Code is below the error.
Error
Traceback (most recent call last):
File "C:\Users\tyler\Anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 2525, in get_loc
return self._engine.get_loc(key)
File "pandas/_libs/index.pyx", line 117, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 139, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1265, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1273, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'tags'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "parsePandasSECData.py", line 64, in <module>
main()
File "parsePandasSECData.py", line 42, in main
good_will_relation = data_frame[data_frame['tags'].str.contains(word_of_interest)]
File "C:\Users\tyler\Anaconda3\lib\site-packages\pandas\core\frame.py", line 2139, in __getitem__
return self._getitem_column(key)
File "C:\Users\tyler\Anaconda3\lib\site-packages\pandas\core\frame.py", line 2146, in _getitem_column
return self._get_item_cache(key)
File "C:\Users\tyler\Anaconda3\lib\site-packages\pandas\core\generic.py", line 1842, in _get_item_cache
values = self._data.get(item)
File "C:\Users\tyler\Anaconda3\lib\site-packages\pandas\core\internals.py", line 3843, in get
loc = self.items.get_loc(item)
File "C:\Users\tyler\Anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 2527, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/_libs/index.pyx", line 117, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 139, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1265, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1273, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'tags'
Code
import pandas as pd
import os.path
import time
import csv

def main():
    start_time = time.time()
    file_path = "C:/Users/TA/Desktop/Data/"
    word_of_interest = "ShareholdersEquity"
    NUM_FILE_NAME = "number.csv"
    SUB_FILE_NAME = "subnumber.csv"
    quarterly_list = ['Q1', 'Q2', 'Q3', 'Q4']
    all_concat_data = None
    pd.set_option('display.max_row', 1000)
    for counter in range(9, 19):
        for index in range(len(quarterly_list)):
            # iterates over all file locations
            num_file_path = file_path + quarterly_list[index] + str(counter) + '/' + NUM_FILE_NAME
            sub_file_path = file_path + quarterly_list[index] + str(counter) + '/' + SUB_FILE_NAME
            if os.path.exists(num_file_path) and os.path.exists(sub_file_path):
                print('Starting ' + quarterly_list[index] + str(counter) + ' Data')
                # Load data
                data_frame = pd.read_csv(num_file_path, dtype={'adsh': str, 'tag': str, 'version coreg': str, 'ddate': int, 'qtrs': int, 'uom': str, 'value': float, 'footnote': str},
                                         header=0, delimiter='\t', low_memory=False, encoding='ISO-8859-1')
                # Comparative data
                transaction_descriptions = pd.read_csv(sub_file_path, dtype={'adsh': str}, header=0, delimiter='\t', low_memory=False, encoding='ISO-8859-1')
                # is_goodwill = data_frame['tag'] == word_of_interest
                # good_will_relation = data_frame[is_goodwill]
                good_will_relation = data_frame[data_frame['tags'].str.contains(word_of_interest)]
                captured_data = good_will_relation.merge(transaction_descriptions, how='inner', left_on='adsh', right_on='adsh')
                if all_concat_data is not None:
                    all_concat_data = pd.concat([all_concat_data, captured_data])
                else:
                    all_concat_data = captured_data
            else:
                print(quarterly_list[index] + str(counter) + ' Does not exist...Skipping')
    print('Starting Writer operation')
    writer = pd.ExcelWriter('output.xlsx')
    all_concat_data.to_excel(writer, 'Sheet1')
    print("--- %s seconds ---" % (time.time() - start_time))

if __name__ == "__main__":
    main()
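No accepted answer is shown here, but note that the dtype mapping in read_csv and the commented-out exact-match lines both name the column 'tag', while the failing line asks for 'tags'. Assuming 'tag' is the real column name, the substring filter would be:

# hypothetical fix: the column appears to be 'tag', not 'tags';
# na=False treats missing tags as non-matches instead of propagating NaN
good_will_relation = data_frame[data_frame['tag'].str.contains(word_of_interest, na=False)]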

Too many indices error coming up in Python

This is my first bash at using Python. I am making an array of fathers and children; I have a dataset in a .csv file which I need to get into Python so I can later take it over to JavaScript. However, the same error message keeps coming up, which I have put below. Further down is my script. I would be most grateful for any advice!
Gabriella
>>> runfile('/Users/gkountourides/Desktop/FunctionalExperiment/untitled0.py', wdir='/Users/gkountourides/Desktop/FunctionalExperiment')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "//anaconda/lib/python3.5/site-packages/spyderlib/widgets/externalshell/sitecustomize.py", line 714, in runfile
execfile(filename, namespace)
File "//anaconda/lib/python3.5/site-packages/spyderlib/widgets/externalshell/sitecustomize.py", line 89, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "/Users/gkountourides/Desktop/FunctionalExperiment/untitled0.py", line 41, in <module>
father,child = fc_csv_to_javascript_arrays("/Users/gkountourides/Desktop/FunctionalExperiment/fatherChild.csv")
File "/Users/gkountourides/Desktop/FunctionalExperiment/untitled0.py", line 38, in fc_csv_to_javascript_arrays
father_str, child_str = from_fc_array_to_fc_string(full_array)
File "/Users/gkountourides/Desktop/FunctionalExperiment/untitled0.py", line 30, in from_fc_array_to_fc_string
father_array = input_2d_array[:,0]
IndexError: too many indices for array
>>>
And then my actual script:
import glob
import numpy as np

def list_all_jpgs():
    javascript_string = "["
    for filename in glob.glob("*.jpg"):
        javascript_string += '"' + filename + '",'
    javascript_string = javascript_string[0:-1] + "]"
    return javascript_string

def load_into_np_array(input_csv_file):
    loaded_array = np.genfromtxt(input_csv_file, dtype=str, delimiter=",")
    return loaded_array

def from_single_array_to_string(input_1d_array):
    output_string = '['
    for entry in input_1d_array:
        output_string += '"' + str(entry) + '",'
    output_string = output_string[0:-1] + ']'
    return output_string

def from_fc_array_to_fc_string(input_2d_array):
    father_array = input_2d_array[:, 0]
    child_array = input_2d_array[:, 1]
    father_string = from_single_array_to_string(father_array)
    child_string = from_single_array_to_string(child_array)
    return father_string, child_string

def fc_csv_to_javascript_arrays(csv_input):
    full_array = load_into_np_array(csv_input)
    father_str, child_str = from_fc_array_to_fc_string(full_array)
    return father_str, child_str

father, child = fc_csv_to_javascript_arrays("/Users/gkountourides/Desktop/FunctionalExperiment/fatherChild.csv")
print(father)
print(child)
The too many indices error indicates that input_2d_array is not a 2D array; genfromtxt() is not returning what you are expecting.
numpy.genfromtxt produces array of what looks like tuples, not a 2D array—why?
http://docs.scipy.org/doc/numpy/reference/generated/numpy.genfromtxt.html
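One plausible cause (an assumption, since the CSV itself isn't shown): if the file contains only a single data row, np.genfromtxt returns a 1-D array, and 2-D indexing like input_2d_array[:, 0] then fails. A defensive sketch:

import numpy as np

def load_into_np_array(input_csv_file):
    loaded_array = np.genfromtxt(input_csv_file, dtype=str, delimiter=",")
    # A one-row CSV comes back 1-D; promote it so [:, 0] indexing works
    return np.atleast_2d(loaded_array)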
