Check string for specific format of substring, how to..? - python

Two strings. My items name:
Parfume name EDT 50ml
And competitor's items name:
Parfume another name EDP 60ml
And i have a long list of these names in one column, competitors names in other column, and I want to leave only those rows in dataframe, that have same amount of ml in both my and competitors names no matter what everything else in these strings look like. So how do I find a substring ending with 'ml' in a bigger string? I could simply do
"**ml" in competitors_name
to see if they both contain the same amount of ml.
Thank you
UPDATE
'ml' is not always at the end of string. It might look like this
Parfume yet another great name 60ml EDP

Try this:
import re
def same_measurement(my_item, competitor_item, unit="ml"):
matcher = re.compile(r".*?(\d+){}".format(unit))
my_match = matcher.match(my_item)
competitor_match = matcher.match(competitor_item)
return my_match and competitor_match and my_match.group(1) == competitor_match.group(1)
my_item = "Parfume name EDT 50ml"
competitor_item = "Parfume another name EDP 50ml"
assert same_measurement(my_item, competitor_item)
my_item = "Parfume name EDT 50ml"
competitor_item = "Parfume another name EDP 60ml"
assert not same_measurement(my_item, competitor_item)

You could use the python Regex library to select the 'xxml' values for each of your data rows and then do some logic to check if they match.
import re
data_rows = [["Parfume name EDT", "Parfume another name EDP 50ml"]]
for data_pairs in data_rows:
my_ml = None
comp_ml = None
# Check for my ml matches and set value
my_ml_matches = re.search(r'(\d{1,3}[Mm][Ll])', data_pairs[0])
if my_ml_matches != None:
my_ml = my_ml_matches[0]
else:
print("my_ml has no ml")
# Check for comp ml matches and set value
comp_ml_matches = re.search(r'(\d{1,3}[Mm][Ll])', data_pairs[1])
if comp_ml_matches != None:
comp_ml = comp_ml_matches[0]
else:
print("comp_ml has no ml")
# Print outputs
if (my_ml != None) and (comp_ml != None):
if my_ml == comp_ml:
print("my_ml: {0} == comp_ml: {1}".format(my_ml, comp_ml))
else:
print("my_ml: {0} != comp_ml: {1}".format(my_ml, comp_ml))
Where data_rows = each row in the data set
Where data_pairs = {your_item_name, competitor_item_name}

You could use a lambda function to do that.
import pandas as pd
import re
d = {
'Us':
['Parfume one 50ml', 'Parfume two 100ml'],
'Competitor':
['Parfume uno 50ml', 'Parfume dos 200ml']
}
df = pd.DataFrame(data=d)
df['Eq'] = df.apply(lambda x : 'Yes' if re.search(r'(\d+)ml', x['Us']).group(1) == re.search(r'(\d+)ml', x['Competitor']).group(1) else "No", axis = 1)
Result:
Doesn't matter whether 'ml' is in the end of in the middle of the string.

Related

How create a sqlalchemy delete query with multiples parameter from a loop

I'm new in python and sqlalchemy.
I already have a delete method working if I construct the where conditions by hand.
Now, I need to read the columns and values from an enter request in yaml format and create the where conditions.
#enter data as yaml
items:
- item:
table: [MyTable,OtherTable]
filters:
field_id: 1234
#other_id: null
Here is what I try and can't go ahead:
for i in use_case_cfg['items']:
item = i.get('item')
for t in item['table']:
if item['filters']:
filters = item['filters']
where_conditions = ''
count = 0
for column, value in filters.items():
aux = str(getattr(t, column) == bindparam(value))
if count == 0:
where_conditions += aux
else:
where_conditions += ', ' + aux
count += 1
to_delete = inv[t].__table__.delete().where(text(where_conditions))
#to_delete = t.__table__.delete().where(getattr(t, column) == value)
else:
to_delete = inv[t].__table__.delete()
CoreData.session.execute(to_delete)
To me, it looks ok, but when I run, I got the error below:
sqlalchemy.exc.StatementError: (sqlalchemy.exc.InvalidRequestError) A value is required for bind parameter '9876'
[SQL: DELETE FROM MyTable WHERE "MyTable".field_id = %(1234)s]
[parameters: [{}]]
(Background on this error at: http://sqlalche.me/e/cd3x)
Can someone explain to me what is wrong or the proper way to do it?
Thanks.
There are two problems with the code.
Firstly,
str(getattr(t, column) == bindparam(value))
is binding the value as a placeholder, so you end up with
WHERE f2 = :Bob
but it should be the name that maps to the value in filters (so the column name in your case), so you end up with
WHERE f2 = :f2
Secondly, multiple WHERE conditions are being joined with a comma, but you should use AND or OR, depending on what you are trying to do.
Given a model Foo:
class Foo(Base):
__tablename__ = 'foo'
id = sa.Column(sa.Integer, primary_key=True)
f1 = sa.Column(sa.Integer)
f2 = sa.Column(sa.String)
Here's a working version of a segment of your code:
filters = {'f1': 2, 'f2': 'Bob'}
t = Foo
where_conditions = ''
count = 0
for column in filters:
aux = str(getattr(t, column) == sa.bindparam(column))
if count == 0:
where_conditions += aux
else:
where_conditions += ' AND ' + aux
count += 1
to_delete = t.__table__.delete().where(sa.text(where_conditions))
print(to_delete)
session.execute(to_delete, filters)
If you aren't obliged to construct the WHERE conditions as strings, you can do it like this:
where_conditions = [(getattr(t, column) == sa.bindparam(column))
for column in filters]
to_delete = t.__table__.delete().where(sa.and_(*where_conditions))
session.execute(to_delete, filters)

Add gender column searching rows that contains info from 2 tables

I have a df that contains some emails:
Email
jonathat0420#email.com
12alexander#email.com
14abcdjoanna#email.com
maria44th#email.com
mikeasddf#email.com
I need to add a second column with the gender.
I will have 2 lists:
male_names = ['john', 'alex']
female_names = ['maria', joanna']
My output should look like that:
Email Gender
jonathat0420#email.com 1
12alexander#email.com 1
14abcdjoanna#email.com 2
maria44th#email.com 2
mikeasddf#email.com
I would need to search the emails that contains the names from the lists and if they are in the emails to add them a number, like "1" for males, 2 for "females" and leave empty for the emails without matching in the lists.
Can anybody help me with this?
You could simply use a map, like this:
def isinlist(email, names):
for name in names:
if name in email:
return True
return False
df.loc[:, 'Gender'] = df.Email.map(lambda x : 1 if isinlist(x, male_names) else (2 if isinlist(x, female_names) else None))
However, there are going to be a lot of ambiguous cases that risk being classified erroneously - e.g., "alexandra#email.com" would be classified as male, since alex is the list of male names.
Maybe you could implement a slighly more complex "best match" logic like this?
def maxmatchlen(email, names): # = length of longest name from list that is contained in the email
return max([len(name) for name in names if name in email] + [0]) # append a 0 to avoid empty lists
def f(email, male_names = male_names, female_names = female_names):
male_maxmatchlen = maxmatchlen(email, male_names)
female_maxmatchlen = maxmatchlen(email, female_names)
if male_maxmatchlen > female_maxmatchlen:
return 1
elif female_maxmatchlen > male_maxmatchlen:
return 2
else: # ambiguous case
return None
df.loc[:, 'Gender'] = df.Email.map(f)
It looks like you first must determine if the email contains a name. You can loop through both male and female. That will determine if the name is "in" the email. Then you could make a list or a dictionary of these.
#!/usr/bin/env python3
import os
def get_emails(filepath):
"""Open the data file and read the lines - return a list"""
with open(filepath, "r") as f:
email_list = f.readlines()
for email in email_list:
print(f'Email = {email}')
print(f'The total number of emails = {len(email_list)}')
return email_list
def find_names(email_list):
"""loop through the email list and see if each one contains a male or female name - return dictionary of tuples"""
male_names = ['john', 'alex', 'mike', 'jonathat']
female_names = ['maria', 'joanna']
name_dict = {}
for email in email_list:
for name_f in female_names:
if name_f in email:
data= (name_f , 1)
name_dict[email] = data
print(f"{email} is for {name_f} and is female {data[1]}")
continue
for name_m in male_names:
if name_m in email:
data= (name_m , 2)
name_dict[email] = data
print(f"{email} is for {name_m} and is male {data[1]}")
continue
return name_dict
if __name__ == '__main__':
your_Datafile = r"D:\Share\email.txt"
email_list = get_emails(your_Datafile)
my_dictionary = find_names(email_list)
print(my_dictionary)
for email, data in my_dictionary.items():
print(data[0], data[1], email)

How to read a csv file and sum values based on user input?

Read a CSV file
User have to enter the Mobile number
Program should show the Data usage (i.e. Arithmetic Operation Adding Uplink & downlink) to get the result (Total Data Used)
Here is Example of CSV file
Time_stamp; Mobile_number; Download; Upload; Connection_start_time; Connection_end_time; location
1/2/2020 10:43:55;7777777;213455;2343;1/2/2020 10:43:55;1/2/2020 10:47:25;09443
1/3/2020 10:33:10;9999999;345656;3568;1/3/2020 10:33:10;1/3/2020 10:37:20;89442
1/4/2020 11:47:57;9123456;345789;7651;1/4/2020 11:11:10;1/4/2020 11:40:22;19441
1/5/2020 11:47:57;9123456;342467;4157;1/5/2020 11:44:10;1/5/2020 11:59:22;29856
1/6/2020 10:47:57;7777777;213455;2343;1/6/2020 10:43:55;1/6/2020 10:47:25;09443
With pandas
import pandas as pd
# read in data
df = pd.read_csv('test.csv', sep=';')
# if there are really spaces at the beginning of the column names, they should be removed
df.columns = [col.strip() for col in df.columns]
# sum Download & Upload for all occurrences of the given number
usage = df[['Download', 'Upload']][df.Mobile_number == 7777777].sum().sum()
print(usage)
>>> 431596
if you want Download and Upload separately
# only 1 sum()
usage = df[['Download', 'Upload']][df.Mobile_number == 7777777].sum()
print(usage)
Download 426910
Upload 4686
with user input
This assumes the Mobile_number column has be read into the dataframe as an int
input is a str so it must be converted to int to match the type in the dataframe
df.Mobile_number == 7777777 not df.Mobile_number == '7777777'
number = int(input('Please input a phone number (numbers only)'))
usage = df[['Download', 'Upload']][df.Mobile_number == number].sum().sum()
With no imported modules
# read file and create dict of phone numbers
phone_dict = dict()
with open('test.csv') as f:
for i, l in enumerate(f.readlines()):
l = l.strip().split(';')
if (i != 0):
mobile = l[1]
download = int(l[2])
upload = int(l[3])
if phone_dict.get(mobile) == None:
phone_dict[mobile] = {'download': [download], 'upload': [upload]}
else:
phone_dict[mobile]['download'].append(download)
phone_dict[mobile]['upload'].append(upload)
print(phone_dict)
{'+917777777777': {'download': [213455, 213455], 'upload': [2343, 2343]},
'+919999999999': {'download': [345656], 'upload': [3568]},
'+919123456654': {'download': [345789], 'upload': [7651]},
'+919123456543': {'download': [342467], 'upload': [4157]}}
# function to return usage
def return_usage(data: dict, number: str):
download_usage = sum(data[number]['download'])
upload_usage = sum(data[number]['upload'])
return download_usage + upload_usage
# get user input to return usage
number = input('Please input a phone number')
usage = return_usage(phone_dict, number)
print(usage)
>>> Please input a phone number (numbers only) +917777777777
>>> 431596
The csv is not too much readable, but you could take a look at his library https://pandas.pydata.org/
Once installed you could use:
# ask for the mobile number here
mobile_number = input('phone number? ')
df = pandas.read_csv('data.csv')
# here you will get the data for that user phone
user_data = df[df['Mobile_number'] == mobile_number].copy()
# not pretty sure in this step
user_data['download'].sum()

Function return empty dataframe

I would like to make a loop to create a dataframe that gathers the lines of an input dataframe, which have common points.
My problem : When I apply the function, the output dataframe is empty...
yet with a print (output) in the loop, we see that the program works .. I do not understand, i tried to change return position but that doesn't work
Thank you in advance for your help !
def group (dataframe, identifiant, output):
for i in range(len(identifiant)):
ident = identifiant.loc[i,"IDCTV"]
# print(ident)
for j in range(len(dataframe)):
if dataframe.loc[j,"IDCONTREVENANT"] == ident:
di = dataframe.loc[j, "DATE_INFRACTION"]
nt = dataframe.loc[j,"NOTRAIN"]
genre = dataframe.loc[j,"CODEETATCIVIL"]
age = dataframe.loc[j,"AGE"]
# print(di, nt, genre, age)
for k in range(len(dataframe)):
if k != j :
if dataframe.loc[k,"DATE_INFRACTION"] == di and dataframe.loc[k,"NOTRAIN"] == nt:
idgroup = dataframe.loc[k,"IDCONTREVENANT"]
genreidgroup = dataframe.loc[k,"CODEETATCIVIL"]
ageidgroup = dataframe.loc[k,"AGE"]
output = output.append({ "IDREF" : ident ,"CODEETATCIVILREF" : genre,"AGEREF" : age ,"IDCTV" : idgroup,"CODEETATCIVILCTV" : genreidgroup,"AGECTV" : ageidgroup}, ignore_index = True)
print(output)
return output
group(df,IDCTV,df_groups)
print(df_groups)
I think you want to change
group(df,IDCTV,df_groups)
to
df_groups = group(df,IDCTV,df_groups)
Right now you're calling the group funciton and doing all that calculation, but you're not saving the output anywhere. So when you run print(df_groups) it prints out whatever it was before you called the function.

Index similar entries in Python

I have a column of data (easily imported from Google Docs thanks to gspread) that I'd like to intelligently align. I ingest entries into a dictionary. Input can include email, twitter handle or a blog URL. For example:
mike.j#gmail.com
#mikej45
j.mike#world.eu
_http://tumblr.com/mikej45
Right now, the "dumb" version is:
def NomineeCount(spreadsheet):
worksheet = spreadsheet.sheet1
nominees = worksheet.col_values(6) # F = 6
unique_nominees = {}
for c in nominees:
pattern = re.compile(r'\s+')
c = re.sub(pattern, '', c)
if unique_nominees.has_key(c) == True: # If we already have the name
unique_nominees[c] += 1
else:
unique_nominees[c] = 1
# Print out the alphabetical list of nominees with leading vote count
for w in sorted(unique_nominees.keys()):
print string.rjust(str(unique_nominees[w]), 2)+ " " + w
return nominees
What's an efficient(-ish) way to add in some smarts during the if process?
You can try with defaultdict:
from collections import defaultdict
unique_nominees = defaultdict(lambda: 0)
unique_nominees[c] += 1

Categories