How to use threads for huggingface transformers - python

I'm trying to run a hugging face model, mode exactly "cardiffnlp/twitter-roberta-base-sentiment" on threads. But at the same time, I want just one single instance of it because it's really costly in terms of time.
In other words, I have multiple CSV files (several thousand) and each of them has around 20k-30k lines and I want that each line from all of them to be executed by the huggingface model, as you probably can imagine already this is the reason why I don't want to instantiate a model for each thread (where each thread would be used just to read one line and write it in the database).
The problem with my approach is that when I'm running the code is going to give me an error from huggingface model.
RuntimeError: Already borrowed
Could any of you help me to understand how cand I fix it?
Hugging face model:
class EmotionDetection(object):
def __init__(self, model_name="cardiffnlp/twitter-roberta-base-sentiment"):
self.model_name = model_name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
self.classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True,
task="sentiment-analysis", device=0)
def get_emotion_by_label(self, label: str):
if label == "LABEL_0":
return "negative"
elif label == "LABEL_1":
return "neutral"
elif label == "LABEL_2":
return "positive"
else:
print("SOMETHING IS WRONG")
return ""
def get_emotion(self, phrase):
results = self.classifier(phrase)
res = dict()
for result in results:
for emotion in result:
res.update({self.get_emotion_by_label(emotion['label']): emotion['score']})
return res
My code for generating database:
class GenerateDbThread(object):
def __init__(self, text: str, created_at: datetime.datetime, get_emotion_function, cursor, table_name):
self.table_name = table_name
self.text = text
self.created_at = created_at
emotions = get_emotion_function(self.text)
self.pos = emotions['positive']
self.neg = emotions['negative']
self.neu = emotions['neutral']
self.cursor = cursor
def execute(self):
query = f"INSERT INTO {self.table_name}(date, positive, negative, neutral, tweet) " \
f"VALUES (datetime('{str(self.created_at)}'),{self.pos},{self.neg},{self.neu}, '{self.text}')"
self.cursor.execute(query)
self.cursor.commit()
def get_all_data_files_path(data_dir: str):
return [f for f in os.listdir(data_dir) if os.path.isfile(os.path.join(data_dir, f))]
def run(file: str, table_name: str):
df = pd.read_csv(os.path.join('data', file), delimiter=',')
for index, row in df.iterrows():
text = row['tweet']
language = row['language']
split_data = row['created_at'].split(" ")
GTB_Time = f"{split_data[2]} {split_data[3]} {split_data[4]}"
created_at = datetime.datetime.strptime(row['created_at'], f"%Y-%m-%d %H:%M:%S {GTB_Time}")
if language == "en":
GenerateDbThread(text, created_at, emotion_detector.get_emotion, cursor, table_name)
def init_db(db_name, table_name):
conn = sqlite3.connect(db_name)
cursor = conn.cursor()
cursor.execute(f"""
CREATE TABLE IF NOT EXISTS {table_name} (
uid INTEGER PRIMARY KEY AUTOINCREMENT,
date DATETIME NOT NULL,
positive REAL NOT NULL,
negative REAL NOT NULL,
neutral REAL NOT NULL,
text TEXT NOT NULL
)""")
cursor.execute(f"CREATE INDEX IF NOT EXISTS ix_tweets_index ON {table_name}(uid)")
cursor.close()
ex = ThreadPoolExecutor(max_workers=10)
files = get_all_data_files_path('data')
init_db("DB_NAME.db", "TABLE_NAME")
emotion_detector = EmotionDetection()
conn = sqlite3.connect("DB_NAME.db")
cursor = conn.cursor()
pbar = tqdm(total=len(files))
futures = [ex.submit(run, file, "TABLE_NAME") for file in files]
for future in futures:
res = future.result()
pbar.update(1)
pbar.close()

Related

How to query data from a function

The function queries transactions from a database. I'm able to get them to work individually. Is there a way to make the function query the data based on say the customer name AND the date range without making another "if" statement?
def query_data(c, dateRange=None, customer_name=None, customer_id=None, customer_date=None):
if customer_name is not None:
query_where = 'customer_name=?'
query_args = [customer_name]
query_where = f'{query_where}'
if customer_id is not None:
query_where = 'customer_name=(SELECT customer_name FROM customers WHERE customer_id=?)'
query_args = [customer_id]
query_where = f'{query_where}'
if customer_date is not None:
query_where = 'order_date=?'
query_args = [customer_date]
query_where = f'{query_where}'
if dateRange is not None:
split_date = dateRange.split(",")
query_where = 'order_date BETWEEN DATE(?) AND DATE(?)'
query_args = [split_date[0], split_date[1]]
query_where = f'{query_where}'
query_string = f"SELECT * FROM transactions WHERE {query_where}"
c.execute(query_string, query_args)
return c.fetchall()

Django select_for_update function in concurrent process

When I testing the function of update_or_create in multi threading condition, I found the result is not what I wanted,they created more than one record in MySQL. As the code show, update_or_create used select .. for update to lock rows in MySQL, then it should be only one record in MySQL. I used SQLAlchemy and row sql has proved that.
So, is the Django codes wrong?
with Django code:
def get_or_create_ins():
p, created = OxalicAcid.objects.update_or_create(defaults={"formula": "20", "degree": "80"}, name="smart")
def run():
for i in range(10):
t = threading.Thread(target=get_or_create_ins, args=())
t.start()
if __name__ == "__main__":
# more than one record will be created
run()
with SQLAlchemy code:
#contextmanager
def transaction_atomic():
session = Session()
try:
yield session
session.commit()
except Exception as e:
session.rollback()
raise e
def get_result_with_update(session, name):
sql = text("""
select * from acid_oxalicacid where name = :name for update
""")
params = dict(name=name)
cursor = session.execute(sql, params)
result = cursor.fetchall()
return result
def get_result(session, name):
sql = text("""
select * from acid_oxalicacid where name = :name
""")
params = dict(name=name)
cursor = session.execute(sql, params)
result = cursor.fetchall()
return result
def create_data(session, name, degree, formula):
sql = text("""
insert into acid_oxalicacid (name, degree, formula) values (:name, :degree, :formula)
""")
params = dict(
name=name,
degree=degree,
formula=formula
)
session.execute(sql, params)
def get_or_create():
name = "smart"
degree = "50"
formula = "100"
with transaction_atomic() as session:
res = get_result_with_update(session, name)
if not res:
create_data(session, name, degree, formula)
res = get_result(session, name)
return res
if __name__ == "__main__":
# Only one record be created, that's correct
for i in range(10):
t = threading.Thread(target=get_or_create, args=())
t.start()
Because Django use the 'read committed' transaction isolation level, so it will be multiply records, if change it to 'repeatable read', it's will be only one record in database.

Empty user search returns entire database

Context
I've written a python script designed to run on a server. The script accepts a user input to make a search. I.E the user types in an arbitrary string and the database returns all usernames with a similar string.
Description of problem
I'm very uncertain about the security of the input. The program uses a stored procedure and a parameterised procedure, but despite this, if a user types in nothing i.e a blank string or if they enter something like % then the script returns every single username in the database.
Code
import json
import mysql.connector
class json_read():
def __init__(self,name):
self.name = name
def json_caller(self):
with open(self.name) as f:
f = json.load(f)[0]
return f
f = json_read("database_connect.json")
config = f.json_caller()
def mysql_connect(func):
def wrapper(*args, **kwargs):
try:
cnx = mysql.connector.connect(**config)
cursor = cnx.cursor()
cursor.execute(*args, **kwargs)
result = cursor.fetchall()
print("\nConnection is stable # " + func.__name__)
print(result)
except:
print("\nConnection failed # " + func.__name__)
return wrapper
class query_dbh():
f2 = json_read("queries.json")
def __init__(self, index):
self.index = self.f2.json_caller()[index]
#mysql_connect
def query(*args, **kwargs):
pass
search_query = query_dbh("Search_uid").index
search_param = [raw_input("Search: ") + "%"]
query(search_query,(search_param))
Queries are kept in a JSON file and loaded by the script
[
{
"Select_names" : "SELECT first,last FROM user",
"Select_id" : "SELECT id FROM user",
"Order_names" : "SELECT first, last FROM user ORDER BY first ASC",
"Search_uid" : "SELECT uid FROM user WHERE uid LIKE %s"
}
]
Where Search_uid is the query being used.

Need Python Programming Tips

I'm learning python since last few weeks. For better learning, I decided to work on some project. So here is my Class for MySQL connection and demo example as well. Can you please tell me. What other improvement can be possible for following code?
Structure?
What else I can do to optimize code?
And Please forgive. If I'm doing some silly mistakes in code. (I'm learning)
#!/usr/bin/python
import pymysql
# select (table, parameter)
# insert (table, data)
# update (table, id, data)
# delete (table, id)
class MySQL:
def __init__(self):
self.sort_by = ""
self.order = ""
# initiate database connection.
self.connection = pymysql.connect(host='localhost',
user='root',
password='',
db='sherlock',
charset='utf8mb4')
self.cursor = self.connection.cursor(pymysql.cursors.DictCursor)
# this function is for selecting any feild on any table.(feilds veriable is optinal)
def select(self, table, *feilds):
flds = "" #differnt name for feilds veriable.
if not feilds:
flds = '*'
else:
for f in feilds:
if not flds:
flds = f
else:
flds += ",`%s`" % f
sql = "SELECT %s FROM `%s` " % (flds, table)
if self.sort_by:
sql = sql +"order by "+ str(self.sort_by) +" "+ str(self.order)
print sql
self.cursor.execute(sql)
result = self.cursor.fetchall()
return result
# This function is for data sorting for Mysql; but optinal.
# example : SELECT * FROM `users` order by id asc
def order_by(self, sort_by="", order="", *args, **kwargs):
self.sort_by = sort_by
self.order = order
# this function is for closing Mysql connection
def close(self):
self.connection.close()
########### END OF MySQL CLASS #############
sql = MySQL()
# sql.order_by function should be called before the sql.select() function.
sql.order_by("email")
# this will select all the feilds from `users` table.
# you can specify whichever feilds you want to return. like : sql.select("users", "id, email")
result = sql.select("users", "password")
for email in result:
print email["password"]
sql.close()

MySQL in Python complaining about placeholders

I've been trying to use python's MySQLdb to execute SQL on a MySQL Database from SSH on my webhost. This program i wrote (on a mac) should print a table, but it doesn't.
Here's my code:
import feedparser
import time
import MySQLdb
topnews = []
politics = []
# tech = []
sports = []
world = []
mostread = []
business = []
feeds = [topnews, mostread, politics, world, sports, business]
d = feedparser.parse('http://feeds.reuters.com/reuters/topNews/.rss') #Just to keep other cells functioning.
def refresh():
global d
global topnews
global politics
# global tech
global sports
global world
global mostread
global business
topnews = feedparser.parse('http://feeds.reuters.com/reuters/topNews/.rss')
politics = feedparser.parse('http://feeds.reuters.com/reuters/PoliticsNews/.rss')
# tech = feedparser.parse('http://feeds.reuters.com/reuters/technologyNews/.rss')
sports = feedparser.parse('http://feeds.reuters.com/reuters/sportsNews/.rss')
world = feedparser.parse('http://feeds.reuters.com/reuters/worldNews/.rss')
mostread = feedparser.parse('http://feeds.reuters.com/reuters/mostRead/.rss')
business = feedparser.parse('http://feeds.reuters.com/reuters/businessNews/.rss')
global feeds
global d
feeds = [topnews, mostread, politics, world, sports, business]
d = feedparser.parse('http://feeds.reuters.com/reuters/topNews/.rss') #Just to keep other cells functioning.
refresh()
def summarize(feed, num): #Define a method called "summarize"
summary = feed['entries'][num]['summary_detail']['value'] #Make a variable equal to the summary
newsummary = "" #The summary we are trying to make, which is empty so far.
for char in summary: #Keep running the following code as many times as there are characters in summary.
if char == "<": #If the current character is a less than sign,
return newsummary #We can finally show our new summary! Mission Accomplished!!!!!!!
else: #Otherwise,
newsummary = newsummary + char #Add the current character to our new summary.
return newsummary.replace(firstword(summarize(topnews, 0)), "").replace("- ", "")
def identify(feed):
term = feed['entries'][0]['tags'][0]['term']
if term == mostread['entries'][0]['tags'][0]['term']:
return "Most Read"
elif term == topnews['entries'][0]['tags'][0]['term']:
return "Top News"
elif term == politics['entries'][0]['tags'][0]['term']:
return "Politics"
# elif term == tech['entries'][0]['tags'][0]['term']:
# return "Tech"
elif term == sports['entries'][0]['tags'][0]['term']:
return "Sports"
elif term == world['entries'][0]['tags'][0]['term']:
return "World"
elif term == business['entries'][0]['tags'][0]['term']:
return "Business"
def firstword(string):
word = ""
for char in string:
if char == "-":
return word
else:
word = word + char
def cat(feed, num):
spec = identify(feed)
if firstword(summarize(feed, num)) != "(Reuters)":
spec = spec + ", " + firstword(summarize(feed, num))
return spec#.replace("(Reuters)")
def link(feed, num):
return d['entries'][num]['link'] #Gives the link to the specified number article.
def date(feed):
return d['entries'][0]['published']
############################################################################################################################################# Coding Rocks!
# Open database connection
db = MySQLdb.connect("localhost","myusername","mypassword","databasename") # Of course, I included the actual values here.
# prepare a cursor object using cursor() method
cursor = db.cursor()
# Prepare SQL query to INSERT a record into the database.
cursor.execute('''
DROP TABLE IF EXISTS news;
''')
cursor.execute('''
CREATE TABLE news
(
id int unsigned NOT NULL auto_increment,
headline varchar(250) NOT NULL,
summary varchar(5000) NOT NULL,
date varchar(50) NOT NULL,
link varchar(2500) NOT NULL,
imagelink varchar(2500) NOT NULL,
category varchar(50) NOT NULL,
PRIMARY KEY (id)
);
''')
for numelem in range( 0, len(mostread['entries']) - 1):
sqlstring = '''
insert into news (headline, summary, date, link, imagelink, category)
values ("NULLFORNOW", %s, %s, %s, "NULLFORNOW", %s);
''' % ( summarize(mostread, numelem), date(mostread), link(mostread, numelem), cat(mostread, numelem) )
cursor.execute(sqlstring)
# cursor.execute('''
# SELECT * FROM news;
# ''')
# results = cursor.fetchall()
# disconnect from server
db.close()
print "Whoopdeedoo! Program done. :)\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
This throws an error:
Traceback (most recent call last):
File "feedparser.py", line 132, in <module>
cursor.execute(sqlstring)
File "/usr/lib64/python2.6/site-packages/MySQLdb/cursors.py", line 173, in execute
self.errorhandler(self, exc, value)
File "/usr/lib64/python2.6/site-packages/MySQLdb/connections.py", line 36, in defaulterrorhandler
raise errorclass, errorvalue
_mysql_exceptions.ProgrammingError: (1064, "You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near 'Brazil (Reuters) - Brazilian presidential candidate Eduardo Campos was killed in' at line 2")
I am sincerely sorry for the poor quality of this question; I am just too sick of this error, and I really don't know where the error is.
Please tell me where the problem is, and, of course, how to fix it.
Thank you, CJ
EDIT:
I tried #metatoaster's suggestion, and now I am getting the error:
feedparser.py:137: Warning: Data truncated for column 'category' at row 1 cursor.execute(sqlstring, data)
If you refer to the documentation you will see that the execute method calls for a separate data argument, not format the entire SQL statement using % as that will introduce errors into the SQL statement. You can try this yourself by printing the sqlstring you generated and send it to MySQL and you will get that same error. Do this instead, as per the documentation.
data = (
summarize(mostread, numelem),
date(mostread),
link(mostread, numelem),
cat(mostread, numelem),
)
cursor.execute(sqlstring, data)
As for your second error, it means that the input data exceeded your length of the field (which you defined to a max of 50 characters). Again print out what you actually tried to input as category to see that it probably is too long of a string, or the wrong string even.

Categories