Python multiprocessing freeze - python

I'm trying to run this code, but it returns an error that I don't understand. What could cause this to occur, and how should I fix/troubleshoot the problem?
The code:
import pandas as pd
import spacy

dados = pd.read_csv('treino.csv')
nlp = spacy.load('pt_core_news_sm')
textos_para_tratamento = (titulos.lower() for titulos in dados['title'])

def trata_textos(doc):
    tokens_validos = []
    for token in doc:
        e_valido = not token.is_stop and token.is_alpha
        if e_valido:
            tokens_validos.append(token.text)
    if len(tokens_validos) > 2:
        return ' '.join(tokens_validos)

textos_tratados = [trata_textos(doc) for doc in nlp.pipe(textos_para_tratamento,
                                                         batch_size=1000,
                                                         n_process=-1)]
The error message:
RuntimeError:
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.
This probably means that you are not using fork to start your
child process and you have forgotten to use the proper idiom
in the main module:
if __name__ == '__main__':
freeze_support()
...
The "freeze_support()" line can be omitted if the program
is not going to be frozen to produce an executable.

The error message is telling you to reorganize your code so that the part that starts worker processes runs only in the main module. Since your code isn't runnable as given, I can only suggest the following reorganization:
import pandas as pd
import spacy

def trata_textos(doc):
    tokens_validos = []
    for token in doc:
        e_valido = not token.is_stop and token.is_alpha
        if e_valido:
            tokens_validos.append(token.text)
    if len(tokens_validos) > 2:
        return ' '.join(tokens_validos)

if __name__ == '__main__':
    dados = pd.read_csv('treino.csv')
    nlp = spacy.load('pt_core_news_sm')
    textos_para_tratamento = (titulos.lower() for titulos in dados['title'])
    textos_tratados = [trata_textos(doc) for doc in nlp.pipe(textos_para_tratamento,
                                                             batch_size=1000,
                                                             n_process=-1)]
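For completeness, this is the generic idiom the error message refers to; a minimal sketch based on the multiprocessing documentation, with do_work standing in for whatever starts the worker processes (freeze_support() only matters if the script is later frozen into an executable):

from multiprocessing import freeze_support

def do_work():
    # anything that spawns worker processes (spaCy's n_process, multiprocessing,
    # ProcessPoolExecutor, ...) belongs here, not at module level
    ...

if __name__ == '__main__':
    freeze_support()  # harmless no-op unless the program is frozen
    do_work()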

Related

Python: PyCUDA ERROR: The context stack was not empty upon module cleanup

I have created a Streamlit app as a demo of a project on Multilingual Text Classification using mBERT in PyTorch. When I run the app with the command python app.py it works fine, but when I run it with streamlit run app.py it throws a PyCUDA error.
Following is the code present in app.py:
import torch
from typing import Text
import streamlit as st
import pandas as pd
from textblob import TextBlob
from inference.inference_onnx import run_onnx_inference
from inference.inference_tensorRT import run_trt_inference
from googletrans import Translator

st.title("LinClass: Multilingual Text Classifier")

input_text = st.text_input('Text:')

####################
# Google Translate API
####################
translator = Translator()
input_text = translator.translate(
    input_text,
    dest="en"
)
input_text = input_text.text

####################
# Select Precision and Inference Method
####################
df = pd.DataFrame()
df["lang"] = ["en"]

precision = st.sidebar.selectbox("Select Precision:",
                                 ("16 Bit", "32 Bit")
                                 )
inference = st.sidebar.selectbox("Inference Method:",
                                 ("ONNX", "TensorRT")
                                 )

if st.button('Show Selected Configuration'):
    st.subheader("Selected Configuration:")
    st.write("Precision: ", precision)
    st.write("Inference: ", inference)

st.subheader("Results")

def result(x):
    """
    Function to classify the comment toxicity based on the probability and given threshold
    params: x(float) - Probability of Toxicity
    """
    if x >= 0.4:
        st.write("Toxic")
    else:
        st.write("Non Toxic")

####################
# Implement Selected Configuration
####################
if precision == "16 Bit":
    if inference == "ONNX":
        df["comment_text"] = [input_text]
        predictions = run_onnx_inference(
            onnx_model_path="/workspace/data/multilingual-text-classifier/output models/mBERT_lightning_fp16_2GPU.onnx",
            stage="inference",
            df_test=df
        )
        predictions = torch.sigmoid(torch.tensor(predictions))
        st.write(input_text)
        st.write(predictions)
        result(predictions)

    if inference == "TensorRT":
        df["content"] = [input_text]
        predictions = run_trt_inference(
            trt_model_path="/workspace/data/multilingual-text-classifier/output models/mBERT_lightning_fp16_bs16.engine",
            stage="inference",
            df_test=df
        )
        predictions = predictions.astype("float32")
        predictions = torch.sigmoid(torch.tensor(predictions))
        st.write(input_text)
        st.write(predictions)
        result(predictions)

if precision == "32 Bit":
    if inference == "ONNX":
        df["comment_text"] = [input_text]
        predictions = run_onnx_inference(
            onnx_model_path="/workspace/data/multilingual-text-classifier/output models/mBERT_fp32.onnx",
            stage="inference",
            df_test=df
        )
        predictions = torch.sigmoid(torch.tensor(predictions))
        st.write(input_text)
        st.write(predictions)
        result(predictions)

    if inference == "TensorRT":
        df["content"] = [input_text]
        predictions = run_trt_inference(
            trt_model_path="/workspace/data/multilingual-text-classifier/output models/mBERT_fp32.engine",
            stage="inference",
            df_test=df
        )
        predictions = predictions.astype("float32")
        predictions = torch.sigmoid(torch.tensor(predictions))
        st.write(input_text)
        st.write(predictions)
        result(predictions)

####################
# Take Feedback
####################
st.subheader("Feedback:")
feedback = st.radio(
    "Are you satisfied with the results?",
    ('Yes', 'No'))
st.write("Thanks for the Feedback!")
Error
-------------------------------------------------------------------
PyCUDA ERROR: The context stack was not empty upon module cleanup.
-------------------------------------------------------------------
A context was still active when the context stack was being
cleaned up. At this point in our execution, CUDA may already
have been deinitialized, so there is no way we can finish
cleanly. The program will be aborted now.
Use Context.pop() to avoid this problem.
-------------------------------------------------------------------
Aborted (core dumped)
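For reference, the Context.pop() idiom that the error message mentions looks roughly like this with PyCUDA's driver API; this is a minimal sketch of explicit context management, not a drop-in fix for the Streamlit app above:

import pycuda.driver as cuda

cuda.init()
ctx = cuda.Device(0).make_context()  # pushes a new CUDA context onto the stack
try:
    pass  # ... run the CUDA/TensorRT work here ...
finally:
    ctx.pop()  # pop the context so the stack is empty before the process exits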

Using concurrent.futures within a for statement

I store QueryText within a pandas dataframe. Once I've loaded all the queries, I want to run an analysis against each query. Currently I have ~50k to evaluate, so doing them one by one will take a long time.
So I wanted to implement concurrent.futures. How do I take the individual QueryText stored within fullAnalysis, pass it to concurrent.futures, and return the output as a variable?
Here is my entire code:
import pandas as pd
import time
import gensim
import sys
import warnings
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import as_completed

fullAnalysis = pd.DataFrame()

def fetch_data(jFile='ProcessingDetails.json'):
    print("Fetching data...please wait")

    # read JSON file for latest dictionary file name
    baselineDictionaryFileName = 'Dictionary/Dictionary_05-03-2020.json'

    # copy data to pandas dataframe
    labelled_data = pd.read_json(baselineDictionaryFileName)

    # Add two more columns to get the most similar text and score
    labelled_data['SimilarText'] = ''
    labelled_data['SimilarityScore'] = float()

    print("Data fetched from " + baselineDictionaryFileName + " and there are " + str(labelled_data.shape[0]) + " rows to be evaluated")
    return labelled_data

def calculateScore(inputFunc):
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    model = gensim.models.Word2Vec.load('w2v_model_bigdata')
    inp = inputFunc
    print(inp)
    out = dict()
    strEvaluation = inp.split("most_similar ", 1)[1]
    # while inp != 'quit':
    split_inp = inp.split()
    try:
        if split_inp[0] == 'help':
            pass
        elif split_inp[0] == 'similarity' and len(split_inp) >= 3:
            pass
        elif split_inp[0] == 'most_similar' and len(split_inp) >= 2:
            for pair in model.most_similar(positive=[split_inp[1]]):
                out.update({pair[0]: pair[1]})
    except KeyError as ke:
        # print(str(ke) + "\n")
        inp = input()
    return out

def main():
    with ThreadPoolExecutor(max_workers=5) as executor:
        for i in range(len(fullAnalysis)):
            text = fullAnalysis['QueryText'][i]
            arg = 'most_similar' + ' ' + text
            # for item in executor.map(calculateScore, arg):
            output = executor.map(calculateScore, arg)
    return output

if __name__ == "__main__":
    fullAnalysis = fetch_data()
    results = main()
    print(f'results: {results}')
The Python Global Interpreter Lock (GIL) allows only one thread at a time to hold control of the Python interpreter. Since your function calculateScore is probably CPU-bound and requires the interpreter to execute its bytecode, you are likely gaining little by using threading. If, on the other hand, it were doing mostly I/O operations, it would be giving up the GIL for most of its running time, allowing other threads to run. But that does not seem to be the case here. You should probably be using the ProcessPoolExecutor from concurrent.futures (try it both ways and see):
def main():
    with ProcessPoolExecutor(max_workers=None) as executor:
        the_futures = {}
        for i in range(len(fullAnalysis)):
            text = fullAnalysis['QueryText'][i]
            arg = 'most_similar' + ' ' + text
            future = executor.submit(calculateScore, arg)
            the_futures[future] = i  # map future to request
        for future in as_completed(the_futures):  # results as they become available, not necessarily the order of submission
            i = the_futures[future]  # the original index
            result = future.result()  # the result
If you omit the max_workers parameter (or specify a value of None) from the ProcessPoolExecutor constructor, the default will be the number of processors you have on your machine (not a bad default). There is no point in specifying a value larger than the number of processors you have.
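For reference, a quick way to see what that default works out to on a given machine (the executor consults os.cpu_count() when max_workers is None):

import os
from concurrent.futures import ProcessPoolExecutor

print(os.cpu_count())  # number of processors; the default worker count
with ProcessPoolExecutor() as executor:  # equivalent to max_workers=None
    pass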
If you do not need to tie each future back to the original request, then the_futures can just be a list that the futures are appended to. But simpler yet is to not even bother with the as_completed method:
def main():
    with ProcessPoolExecutor(max_workers=5) as executor:
        the_futures = []
        for i in range(len(fullAnalysis)):
            text = fullAnalysis['QueryText'][i]
            arg = 'most_similar' + ' ' + text
            future = executor.submit(calculateScore, arg)
            the_futures.append(future)
        # wait for the completion of all the results and return them all:
        results = [f.result() for f in the_futures]  # results in creation order
        return results
It should be mentioned that code that launches ProcessPoolExecutor tasks should be in a block governed by if __name__ == '__main__':. If it isn't, you will get into a recursive loop, with each subprocess launching a new ProcessPoolExecutor. But your code already appears to have that guard. Perhaps you meant to use ProcessPoolExecutor all along?
Also:
I don't know what the line ...
model = gensim.models.Word2Vec.load('w2v_model_bigdata')
... in function calculateScore does. It may be the one I/O-bound statement. But this appears to be something that does not vary from call to call. If that is the case, and model is not being modified in the function, shouldn't this statement be moved out of the function and computed just once? Then the function would clearly run faster (and be clearly CPU-bound).
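One way to do that with ProcessPoolExecutor is its initializer hook (available since Python 3.7), which runs once in each worker process. A rough sketch, assuming the rest of calculateScore stays as in the question (init_worker is my own name, not part of the original code):

from concurrent.futures import ProcessPoolExecutor
import gensim

model = None  # loaded once per worker process by init_worker

def init_worker():
    global model
    model = gensim.models.Word2Vec.load('w2v_model_bigdata')

def calculateScore(arg):
    # same body as in the question, but using the per-process `model`
    # instead of reloading it on every call
    ...

def main(args):
    with ProcessPoolExecutor(initializer=init_worker) as executor:
        futures = [executor.submit(calculateScore, a) for a in args]
        return [f.result() for f in futures]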
Also:
The exception block ...
except KeyError as ke:
#print(str(ke) + "\n")
inp = input()
... is puzzling. You are inputting a value that will never be used right before returning. If this is to pause execution, there is no error message being output.
With Booboo's assistance, I was able to update the code to use ProcessPoolExecutor. Here is my updated code; overall, processing has been sped up by more than 60%.
I did run into a processing issue and found the topic BrokenPoolProcess, which addresses it.
output = {}
thePool = {}

def main(labelled_data, dictionaryRevised):
    args = sys.argv[1:]
    with ProcessPoolExecutor(max_workers=None) as executor:
        for i in range(len(labelled_data)):
            text = labelled_data['QueryText'][i]
            arg = 'most_similar' + ' ' + text
            output = winprocess.submit(
                executor, calculateScore, arg
            )
            thePool[output] = i  # original index for future to request

        for output in as_completed(thePool):  # results as they become available not necessarily the order of submission
            i = thePool[output]  # the original index
            text = labelled_data['QueryText'][i]
            result = output.result()  # the result

            maximumKey = max(result.items(), key=operator.itemgetter(1))[0]
            maximumValue = result.get(maximumKey)

            labelled_data['SimilarText'][i] = maximumKey
            labelled_data['SimilarityScore'][i] = maximumValue

    return labelled_data, dictionaryRevised

if __name__ == "__main__":
    start = time.perf_counter()
    print("Starting to evaluate Query Text for labelling...")

    output_Labelled_Data, output_dictionary_revised = preProcessor()
    output, dictionary = main(output_Labelled_Data, output_dictionary_revised)

    finish = time.perf_counter()
    print(f'Finished in {round(finish-start, 2)} second(s)')

python win32 api document.getElementById Error

import win32com
import win32com.client
import win32gui
import win32con
import pythoncom

def getIEServer(hwnd, ieServer):
    if win32gui.GetClassName(hwnd) == 'Internet Explorer_Server':
        ieServer.append(hwnd)

if __name__ == '__main__':
    # pythoncom.CoInitializeEx(0)  # not used here; only needed for multithreading
    mainHwnd = win32gui.FindWindow('windowclass', 'windowtitle')
    if mainHwnd:
        ieServers = []
        win32gui.EnumChildWindows(mainHwnd, getIEServer, ieServers)
        if len(ieServers) > 0:
            ieServer = ieServers[0]
            msg = win32gui.RegisterWindowMessage('WM_HTML_GETOBJECT')
            ret, result = win32gui.SendMessageTimeout(ieServer, msg, 0, 0, win32con.SMTO_ABORTIFHUNG, 1000)
            ob = pythoncom.ObjectFromLresult(result, pythoncom.IID_IDispatch, 0)
            doc = win32com.client.dynamic.Dispatch(ob)
            print(doc.url)
            # doc.all['id'].click()
I can get doc (the document object) with the code above, but when I call doc.getElementById("some-id") I get an error like the one below:
TypeError: getElementById() takes 1 positional argument but 2 were given
This happens in IE11.
Please help me. T0T~
P.S. This type of problem is different from the suggested answer. I think the error needs to be fixed in pywin32.

Running for loop in parallel via python

I have a process that loops over a list of IP addresses and returns some information about them. The simple for loop works great; my issue is running this at scale because of Python's Global Interpreter Lock (GIL).
My goal is to have this function run in parallel and make full use of my 4 cores, so that running 100K of these won't take 24 hours the way a normal for loop would.
After reading other answers on here, particularly this one, How do I parallelize a simple Python loop?, I decided to use joblib. When I ran 10 records through it (example above), it took over 10 minutes to run. That doesn't sound right, so I know there is something I'm doing wrong or not understanding. Any help is greatly appreciated!
import pandas as pd
import numpy as np
import os as os
from ipwhois import IPWhois
from joblib import Parallel, delayed
import multiprocessing

num_core = multiprocessing.cpu_count()

iplookup = ['174.192.22.197',
            '70.197.71.201',
            '174.195.146.248',
            '70.197.15.130',
            '174.208.14.133',
            '174.238.132.139',
            '174.204.16.10',
            '104.132.11.82',
            '24.1.202.86',
            '216.4.58.18']
The normal for loop, which works fine:
asn = []
asnid = []
asncountry = []
asndesc = []
asnemail = []
asnaddress = []
asncity = []
asnstate = []
asnzip = []
asndesc2 = []
ipaddr = []
b = 1
totstolookup = len(iplookup)

for i in iplookup:
    i = str(i)
    print("Running #{} out of {}".format(b, totstolookup))
    try:
        obj = IPWhois(i, timeout=15)
        result = obj.lookup_whois()
        asn.append(result['asn'])
        asnid.append(result['asn_cidr'])
        asncountry.append(result['asn_country_code'])
        asndesc.append(result['asn_description'])
        try:
            asnemail.append(result['nets'][0]['emails'])
            asnaddress.append(result['nets'][0]['address'])
            asncity.append(result['nets'][0]['city'])
            asnstate.append(result['nets'][0]['state'])
            asnzip.append(result['nets'][0]['postal_code'])
            asndesc2.append(result['nets'][0]['description'])
            ipaddr.append(i)
        except:
            asnemail.append(0)
            asnaddress.append(0)
            asncity.append(0)
            asnstate.append(0)
            asnzip.append(0)
            asndesc2.append(0)
            ipaddr.append(i)
    except:
        pass
    b += 1
The function to pass to joblib to run on all cores:
def run_ip_process(iplookuparray):
    asn = []
    asnid = []
    asncountry = []
    asndesc = []
    asnemail = []
    asnaddress = []
    asncity = []
    asnstate = []
    asnzip = []
    asndesc2 = []
    ipaddr = []
    b = 1
    totstolookup = len(iplookuparray)

    for i in iplookuparray:
        i = str(i)
        print("Running #{} out of {}".format(b, totstolookup))
        try:
            obj = IPWhois(i, timeout=15)
            result = obj.lookup_whois()
            asn.append(result['asn'])
            asnid.append(result['asn_cidr'])
            asncountry.append(result['asn_country_code'])
            asndesc.append(result['asn_description'])
            try:
                asnemail.append(result['nets'][0]['emails'])
                asnaddress.append(result['nets'][0]['address'])
                asncity.append(result['nets'][0]['city'])
                asnstate.append(result['nets'][0]['state'])
                asnzip.append(result['nets'][0]['postal_code'])
                asndesc2.append(result['nets'][0]['description'])
                ipaddr.append(i)
            except:
                asnemail.append(0)
                asnaddress.append(0)
                asncity.append(0)
                asnstate.append(0)
                asnzip.append(0)
                asndesc2.append(0)
                ipaddr.append(i)
        except:
            pass
        b += 1

    ipdataframe = pd.DataFrame({'ipaddress': ipaddr,
                                'asn': asn,
                                'asnid': asnid,
                                'asncountry': asncountry,
                                'asndesc': asndesc,
                                'emailcontact': asnemail,
                                'address': asnaddress,
                                'city': asncity,
                                'state': asnstate,
                                'zip': asnzip,
                                'ipdescrip': asndesc2})
    return ipdataframe
Run the process using all cores via joblib:
Parallel(n_jobs=num_core)(delayed(run_ip_process)(iplookuparray) for i in iplookup)
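As written, this submits run_ip_process(iplookuparray) once per entry in iplookup, so every job would repeat the full set of lookups. A more typical joblib pattern is to parallelize per IP; a rough sketch along those lines (lookup_one is an illustrative helper, not part of the original code):

def lookup_one(ip):
    # whois lookup for a single IP, reusing the function above on a one-element list
    return run_ip_process([ip])

results = Parallel(n_jobs=num_core)(delayed(lookup_one)(ip) for ip in iplookup)
ipdataframe = pd.concat(results, ignore_index=True)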

Want to filter out failure Message from my robot framework output files

I want to filter failure messages from the output files generated after executing my test cases in Robot Framework. I have tried modules like from robot.api import ExecutionResult, but that only gives me the counts of passed and failed test cases.
I have also tried other Robot Framework libraries, like import robot.errors, to filter out all the error messages, but had no luck. Below is my code block:
#!/usr/bin/python
from robot.api import ExecutionResult
import robot.errors
from robot.result.visitor import ResultVisitor

xmlpath = "<output.xml PATH>"
result = ExecutionResult(xmlpath)
result.configure(stat_config={'suite_stat_level': 2,
                              'tag_stat_combine': 'tagANDanother'})

stats = result.statistics
print stats.total.critical.failed
print stats.total.critical.passed
print stats.total.critical.passed + stats.total.critical.failed

class FailureCollector(ResultVisitor):
    def __init__(self):
        self.failures = []

    def visit_test(self, test):
        if not test.passed:
            self.failures += [test]

failure_collector = FailureCollector()
result.visit(failure_collector)
print failure_collector.failures
# the above print gives me all failed test cases as a list,
# e.g. ['test1:My example Testcase1', 'test2:My example Testcase2']
Any example of how to get this done would be very helpful.
I tried hard to get my expected output using the Robot Framework APIs but didn't find a proper solution. Finally, I got it working with the xml.etree.ElementTree module: I parse the robot output.xml file with it and extract what I need.
import xml.etree.ElementTree as ET
import re

tree = ET.parse('<output.xml file Path>')
root = tree.getroot()
testplans = <Testplans as a list>
i = 0
err_dict = {}

for testplan in testplans:
    full_err_list = []
    err_list = []
    for suite_level_1 in root:
        try:
            if suite_level_1.tag == "suite":
                for suite_level_2 in suite_level_1:
                    if suite_level_2.tag == "suite" and suite_level_2.attrib['name'] == testplan:
                        for suite_level_3 in suite_level_2:
                            if suite_level_3.tag == "suite":
                                for test in suite_level_3:
                                    if test.tag == "test":
                                        for kw_level_5 in test:
                                            if kw_level_5.tag == "kw" and kw_level_5.attrib['name'] == '<specific keyword under which you expect your result (error or success message)>':
                                                for msg in kw_level_5:
                                                    if msg.tag == 'msg':
                                                        err_str = msg.text
                                                        # print err_str
                                                        mat = re.match(r'\$\{FinalResult\}\s=\s(.*)', err_str)
                                                        if mat and mat.group(1) != 'Succeeded.':
                                                            i = i + 1
                                                            # print mat.group(1), i
                                                            err = mat.group(1)
                                                            full_err_list.append(err)
                                                            if err not in err_list:
                                                                err_list.append(err)
        except:
            print "Errors found"
            break

    err_dict[testplan] = err_list
    print "\n########## " + testplan + " ##########\n"
    print "Total no of failures", len(full_err_list)
    for err_name in err_list:
        print err_name, "===>>", full_err_list.count(err_name)
# The above prints the error name and its count in a specific test plan
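For what it's worth, the ResultVisitor approach from the question can usually be extended to capture the failure messages directly, since the test result objects carry a message attribute; a minimal sketch along those lines (treat the attribute names as an assumption to verify against your Robot Framework version):

from robot.api import ExecutionResult
from robot.result.visitor import ResultVisitor

class FailureMessageCollector(ResultVisitor):
    def __init__(self):
        self.failures = []

    def visit_test(self, test):
        if not test.passed:
            # test.message is assumed to hold the failure message of a failed test
            self.failures.append((test.name, test.message))

result = ExecutionResult("<output.xml PATH>")
collector = FailureMessageCollector()
result.visit(collector)
for name, message in collector.failures:
    print("%s ===>> %s" % (name, message))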
