Add gender column searching rows that contains info from 2 tables

Add gender column searching rows that contains info from 2 tables - python

I have a df that contains some emails:
Email
jonathat0420#email.com
12alexander#email.com
14abcdjoanna#email.com
maria44th#email.com
mikeasddf#email.com
I need to add a second column with the gender.
I will have 2 lists:
# Name fragments searched for inside each email address.
male_names = ['john', 'alex']
female_names = ['maria', 'joanna']
My output should look like that:
Email Gender
jonathat0420#email.com 1
12alexander#email.com 1
14abcdjoanna#email.com 2
maria44th#email.com 2
mikeasddf#email.com
I need to search for the emails that contain any of the names from the lists and, when a name is found, assign a number: 1 for males, 2 for females. Emails that match no name in either list should be left empty.
Can anybody help me with this?

You could simply use a map, like this:
def isinlist(email, names):
    """Return True when at least one entry of *names* is a substring of *email*."""
    return any(candidate in email for candidate in names)
df.loc[:, 'Gender'] = df.Email.map(lambda x : 1 if isinlist(x, male_names) else (2 if isinlist(x, female_names) else None))
However, there are going to be a lot of ambiguous cases that risk being classified erroneously - e.g., "alexandra#email.com" would be classified as male, since "alex" is in the list of male names.
Maybe you could implement a slightly more complex "best match" logic like this?
def maxmatchlen(email, names):
    """Return the length of the longest entry of *names* found inside *email*.

    Returns 0 when no name is contained in the email.
    """
    matched_lengths = (len(candidate) for candidate in names if candidate in email)
    return max(matched_lengths, default=0)
def f(email, male_names = male_names, female_names = female_names):
    """Classify *email* by whichever list contains the longer matching name.

    Returns 1 when the longest male match is longer, 2 when the longest
    female match is longer, and None when both lengths are equal (which
    includes the case where neither list matches).
    """
    best_male = maxmatchlen(email, male_names)
    best_female = maxmatchlen(email, female_names)
    if best_male == best_female:
        # Tie or no match at all: ambiguous, leave the gender empty.
        return None
    return 1 if best_male > best_female else 2
df.loc[:, 'Gender'] = df.Email.map(f)

It looks like you first must determine if the email contains a name. You can loop through both male and female. That will determine if the name is "in" the email. Then you could make a list or a dictionary of these.
#!/usr/bin/env python3
import os
def get_emails(filepath):
    """Read one email address per line from *filepath* and return them as a list.

    Surrounding whitespace (including the trailing newline that
    ``readlines()`` would keep) is stripped from every entry and blank
    lines are skipped, so the returned strings can be used directly as
    dictionary keys or printed without stray line breaks.
    """
    with open(filepath, "r") as f:
        stripped_lines = (line.strip() for line in f)
        email_list = [line for line in stripped_lines if line]
    for email in email_list:
        print(f'Email = {email}')
    print(f'The total number of emails = {len(email_list)}')
    return email_list
def find_names(email_list, male_names=None, female_names=None):
    """Map each email that contains a known name to a ``(name, gender_code)`` tuple.

    Gender codes follow the convention stated in the question:
    1 for male, 2 for female. Emails that contain no known name are
    omitted from the result.

    Args:
        email_list: iterable of email strings to inspect.
        male_names: optional list of male name fragments; defaults to the
            built-in sample list.
        female_names: optional list of female name fragments; defaults to
            the built-in sample list.

    Returns:
        dict mapping email -> (matched_name, gender_code).
    """
    if male_names is None:
        male_names = ['john', 'alex', 'mike', 'jonathat']
    if female_names is None:
        female_names = ['maria', 'joanna']
    name_dict = {}
    for email in email_list:
        # Female names are checked first; a later male match for the same
        # email overwrites the entry, and within one list the last matching
        # name wins.
        for name_f in female_names:
            if name_f in email:
                data = (name_f, 2)
                name_dict[email] = data
                print(f"{email} is for {name_f} and is female {data[1]}")
        for name_m in male_names:
            if name_m in email:
                data = (name_m, 1)
                name_dict[email] = data
                print(f"{email} is for {name_m} and is male {data[1]}")
    return name_dict
if __name__ == '__main__':
    # Script entry point: load the emails, classify them, then dump the results.
    data_path = r"D:\Share\email.txt"
    matches = find_names(get_emails(data_path))
    print(matches)
    for address, record in matches.items():
        matched_name, gender_code = record[0], record[1]
        print(matched_name, gender_code, address)

Related

Finding farmers markets from zip codes or town names

I'm working on a small project where the user enters a zip code or town name and the computer outputs all farmers markets that are in that zip code or town.
I have a function that turns markets.txt (a txt which includes state, zipcode, town, city, name of farmers market) into 2 dictionaries: one that maps zip codes to farmers market tuples and another that maps towns to zip codes. My main program first checks if the user input is a zip code or if it is a town name and then (if the user gives a zip code) takes a list of all zip codes and finds a farmers market tuple to then format for readability as an output. In the case that it is a town, it's largely the same except the function retrieves a zip code from the user input town name to then get farmer's market tuples and format it.
I'm looking through markets.txt however and there are multiple farmer's markets for town names (such as this town called Granville) but the program only prints 1 rather than all of them.
Thanks so much!
Here is the code I have so far:
d1 = {}
d2 = {}
def read_markets(filename):
"""
Read in the farmers market data from the file named filename and return
a tuple of two objects:
1) A dictionary mapping zip codes to lists of farmers market tuples.
2) A dictionary mapping towns to sets of zip codes.

Note: as written, each key holds a single value, not a list or a set.
The module-level dicts d1 and d2 are filled in place and returned.
"""
with open(filename) as f:
for line in f:
# each line is split on '#' into its fields
s = line.strip().split('#')
#print(type(s[4]))
# s[4] are zipcodes, s[3] are towns, s[:4] is state, name, address, city
# Plain assignment replaces whatever was stored under the same key, so a
# zip code or town that appears on several lines keeps only the last one.
d1[s[4]] = (s[:4])
d2[s[3]] = (s[4])
#print(s[:4])
return d1, d2
def print_market(market):
"""
Returns a string representing the farmers market tuple
passed to the market parameter.

The string has three lines: name, address, then "city, state zipcode".
"""
#input is market tuple
name = market[1]
address = market[2]
city = market[3]
state = market[0]
# Reverse lookup in d1: find the position of the first value equal to
# market and take the key (zip code) at that same position.
zcode = (list(d1.keys())[list(d1.values()).index(market)])
final = name + "\n" + address + "\n" + city + ", " + state + " " + zcode
#print(final)
return final
# print(b)
if __name__ == "__main__":
# This main program first reads in the markets.txt once (using the function read_markets)
# and then asks the user repeatedly to enter a zip code or
# a town name (in a while loop until the user types "quit").
FILENAME = "markets.txt"
# c is the loop flag: it is set to 1 when the user types "quit"
c = 0
try:
zip_to_market, town_to_zips = read_markets(FILENAME)
while c < 1:
u_in = input("enter zip code or town name: ")
if u_in == "quit":
c = 1
else:
#check if its a zip code
# all-digit input is treated as a zip code, anything else as a town name
if u_in.isdigit():
print("Ok, I will look for farmers markets matching that zipcode")
askzip = str(u_in)
#list of all zipcodes
mlist = d1.keys()
#look for corresponding zipcode in dictionary that maps zipcodes to market tuples
if askzip in mlist:
# d1 holds one market record per zip code, so one market is printed
out1 = d1.get(askzip)
print(print_market(out1))
else:
print('No corresponding farmers markets exist for that zipcode')
#user input is town name
else:
print("Ok, I will look for farmers markets in that town")
asktown = str(u_in)
tlist = d2.keys()
if asktown in tlist:
# d2 holds one zip code per town, so the lookup below yields one market
outzip = d2.get(asktown)
#print(outzip)
#we got zip from our dictionary mapping zip codes to town names so now
mlist = d1.keys()
if outzip in mlist:
# print(outzip)
out1 = d1.get(outzip)
print(print_market(out1))
else:
print('No corresponding farmers markets exist for that town name')
except (FileNotFoundError, IOError):
print("Error reading {}".format(FILENAME))
#testing things
#read_markets("markets.txt")
#market = ['Wyoming', 'Wyoming Fresh Market', '121 W 15th Street', 'Cheyenne']
#print_market(market)
and a pastebin of the snippet of markets.txt that include multiple farmers markets in one town(Granville):
https://pastebin.com/cFdb7HZ5

I fixed the problem! :D
The problem was that a plain dictionary keeps only one value per key, so earlier markets with the same zip code or town were overwritten. This can be solved by using a defaultdict(list) and appending to the lists. Then, to navigate the nested lists, I wrote a separate method to search for values, using an approach similar to what Ironkey suggested. I also changed the way zip codes are retrieved from market tuples by using another defaultdict that maps each unique address to its zip code.
Final fixed code that accounts for multiple markets in 1 zip code/town name.
from collections import defaultdict
# Module-level lookup tables filled by read_markets().
Dzcode_mtuples = defaultdict(list)   # zip code -> list of [market record] wrappers
Dtowns_mtuples = defaultdict(list)   # town     -> list of [market record] wrappers
Dtowns_zcodes = defaultdict(list)    # town     -> list of distinct zip codes
Daddress_zcodes = defaultdict(list)  # address  -> list of distinct zip codes
a = set()                            # every zip code seen so far


def read_markets(filename):
    """
    Read the farmers market data from the file named filename and fill the
    module-level lookup tables.

    Each line is expected to hold '#'-separated fields:
    state, name, address, town, zip code. Lines with fewer than five
    fields (for example blank lines) are skipped.

    Returns a tuple of two objects:
    1) A dictionary mapping zip codes to lists of market entries.
    2) A dictionary mapping towns to lists of market entries.
    Every market entry is a one-element list wrapping the record
    [state, name, address, town]; callers read the record as entry[0].
    """
    with open(filename) as f:
        for line in f:
            s = line.strip().split('#')
            if len(s) < 5:
                # Blank or malformed line: nothing usable to index.
                continue
            address, town, zcode = s[2], s[3], s[4]
            a.add(zcode)
            # Kept wrapped in a list because callers index it as entry[0].
            mtuple = [s[:4]]
            Dzcode_mtuples[zcode].append(mtuple)
            Dtowns_mtuples[town].append(mtuple)
            # Record this town's own zip codes, not the global set of all zips.
            if zcode not in Dtowns_zcodes[town]:
                Dtowns_zcodes[town].append(zcode)
            # Avoid duplicates: print_market joins this list into one string.
            if zcode not in Daddress_zcodes[address]:
                Daddress_zcodes[address].append(zcode)
    return Dzcode_mtuples, Dtowns_mtuples
def search_nested(mylist, val):
    """Return the first inner list of *mylist* that contains *val*.

    When no inner list contains the value, a string of the form
    "<val> not found" is returned instead.
    """
    for row in mylist:
        for cell in row:
            if cell == val:
                return row
    return str(val) + ' not found'
def list2string(s):
    """Concatenate the string elements of *s* into one string, in order."""
    return "".join(s)
def print_market(market):
    """
    Build the human-readable, three-line description of one market record.

    market is indexed as [state, name, address, city]. The zip code is
    looked up through the address in Daddress_zcodes and joined into a
    single string with list2string.
    """
    state, name, address, city = market[0], market[1], market[2], market[3]
    # Locate the address among the dictionary keys, then take the zip-code
    # list stored at the same position among the values.
    known_addresses = list(Daddress_zcodes.keys())
    zip_lists = list(Daddress_zcodes.values())
    zcode = zip_lists[known_addresses.index(market[2])]
    last_line = city + ", " + state + " " + list2string(zcode)
    final = name + "\n" + address + "\n" + last_line
    return final
# print(b)
if __name__ == "__main__":
# This main program first reads in the markets.txt once (using the function
# from part (a)), and then asks the user repeatedly to enter a zip code or
# a town name (in a while loop until the user types "quit").
FILENAME = "markets.txt"
# c is the loop flag: it is set to 1 when the user types "quit"
c = 0
try:
# note: the second returned dict maps towns to market entries
zip_to_market, town_to_zips = read_markets(FILENAME)
while c < 1:
u_in = input("enter zip code or town name: ")
if u_in == "quit":
c = 1
else:
#check if its a zip code
# all-digit input is treated as a zip code, anything else as a town name
if u_in.isdigit():
print("\nOk, I will look for farmers markets matching that zipcode \n")
askzip = str(u_in)
#list of all zipcodes
zlist = Dzcode_mtuples.keys()
#print(zlist)
#look for corresponding zipcode in dictionary that maps zipcodes to market tuples
if askzip in zlist:
out1 = Dzcode_mtuples.get(askzip)
# each entry wraps the market record in a one-element list, hence x[0]
for x in out1:
print(print_market(x[0]) + "\n")
else:
print('No corresponding farmers markets exist for that zipcode')
#user input is town name
else:
print("\nOk, I will look for farmers markets in that town \n")
asktown = str(u_in)
tlist = Dtowns_mtuples.keys()
if asktown in tlist:
#look for user input town in all the towns
out1 = Dtowns_mtuples.get(asktown)
#out1 is a list of mtuples that satisfy this
#print(out1[0][0])
# print every market stored for this town
for x in out1:
#print(Dzcode_mtuples.values())
#print(x[0])
print(print_market(x[0]) + "\n")
else:
print('No corresponding farmers markets exist for that town name')
except (FileNotFoundError, IOError):
print("Error reading {}".format(FILENAME))
#testing things
#read_markets("markets.txt")
#market = ['Wyoming', 'Wyoming Fresh Market', '121 W 15th Street', 'Cheyenne']
#print_market(market)

How to read a csv file and sum values based on user input?

Read a CSV file
User have to enter the Mobile number
Program should show the Data usage (i.e. Arithmetic Operation Adding Uplink & downlink) to get the result (Total Data Used)
Here is Example of CSV file
Time_stamp; Mobile_number; Download; Upload; Connection_start_time; Connection_end_time; location
1/2/2020 10:43:55;7777777;213455;2343;1/2/2020 10:43:55;1/2/2020 10:47:25;09443
1/3/2020 10:33:10;9999999;345656;3568;1/3/2020 10:33:10;1/3/2020 10:37:20;89442
1/4/2020 11:47:57;9123456;345789;7651;1/4/2020 11:11:10;1/4/2020 11:40:22;19441
1/5/2020 11:47:57;9123456;342467;4157;1/5/2020 11:44:10;1/5/2020 11:59:22;29856
1/6/2020 10:47:57;7777777;213455;2343;1/6/2020 10:43:55;1/6/2020 10:47:25;09443

With pandas
import pandas as pd
# read in data
df = pd.read_csv('test.csv', sep=';')
# if there are really spaces at the beginning of the column names, they should be removed
df.columns = [col.strip() for col in df.columns]
# sum Download & Upload for all occurrences of the given number
usage = df[['Download', 'Upload']][df.Mobile_number == 7777777].sum().sum()
print(usage)
>>> 431596
if you want Download and Upload separately
# only 1 sum()
usage = df[['Download', 'Upload']][df.Mobile_number == 7777777].sum()
print(usage)
Download 426910
Upload 4686
with user input
This assumes the Mobile_number column has been read into the dataframe as an int
input is a str so it must be converted to int to match the type in the dataframe
df.Mobile_number == 7777777 not df.Mobile_number == '7777777'
number = int(input('Please input a phone number (numbers only)'))
usage = df[['Download', 'Upload']][df.Mobile_number == number].sum().sum()
With no imported modules
# read file and create dict of phone numbers
# Build a dict keyed by phone number; each value collects every download
# and upload figure found for that number in test.csv.
phone_dict = dict()
with open('test.csv') as f:
    next(f, None)  # skip the header row
    for l in f:
        l = l.strip()
        if not l:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        l = l.split(';')
        mobile = l[1]
        download = int(l[2])
        upload = int(l[3])
        # setdefault creates the entry on first sight of a number
        usage_lists = phone_dict.setdefault(mobile, {'download': [], 'upload': []})
        usage_lists['download'].append(download)
        usage_lists['upload'].append(upload)
print(phone_dict)
{'+917777777777': {'download': [213455, 213455], 'upload': [2343, 2343]},
'+919999999999': {'download': [345656], 'upload': [3568]},
'+919123456654': {'download': [345789], 'upload': [7651]},
'+919123456543': {'download': [342467], 'upload': [4157]}}
# function to return usage
def return_usage(data: dict, number: str):
    """Return total usage (all downloads plus all uploads) recorded for *number*.

    *data* maps a phone number to a dict with 'download' and 'upload'
    lists. A number that is not a key of *data* raises KeyError.
    """
    record = data[number]
    return sum(record['download']) + sum(record['upload'])
# get user input to return usage
number = input('Please input a phone number')
usage = return_usage(phone_dict, number)
print(usage)
>>> Please input a phone number (numbers only) +917777777777
>>> 431596

The CSV is not very readable, but you could take a look at this library: https://pandas.pydata.org/
Once installed you could use:
# ask for the mobile number here
import pandas

# ask for the mobile number here
mobile_number = input('phone number? ')
# the sample file is ';'-separated and its header names carry a leading space
df = pandas.read_csv('data.csv', sep=';')
df.columns = [col.strip() for col in df.columns]
# here you will get the data for that user phone
# (input() returns a str; assumes the Mobile_number column was parsed as
# integers — TODO confirm against the real file)
user_data = df[df['Mobile_number'] == int(mobile_number)].copy()
# total data used = download + upload over all rows of that number
total_usage = user_data['Download'].sum() + user_data['Upload'].sum()
print(total_usage)

Split Purchase Order according to Product Category

I wanna split the Purchase Order according to Product Category.
My Code so far:
_inherit ='purchase.order.line'
split = fields.Boolean(string='Split')
_inherit ='purchase.order'
# Button handler: for every distinct product category on the order lines,
# copy the purchase order, then unlink the flagged lines of the record.
def btn_split_rfq(self):
flag = []
for record in self:
if record.order_line:
for rec in record.order_line:
rec.split = True # mark every line as True
flag.append(rec.product_id.categ_id.id) # list of the category ids
newlist=[ii for n,ii in enumerate(flag) if ii not in flag[:n]] # keep each id only once if it appears several times
for index in newlist: # with 2 categories this runs 2 times
# the whole order is copied here, with no filtering by category
quotation_id = self.copy()
# note: the loop variable name index is reused for the order lines here
for index in record.order_line:
if index.split:
self.env['purchase.order.line'].browse(index.id).unlink()
else:
raise ValidationError(_('Please Select Order Line To Split'))
The code so far splits the order into multiple POs - e.g. if I have 2 product categories it creates 2 POs - but each of the two POs contains all 4 products instead of only the products of its own category (see image below).
Output:
But i want this kind of Output:
Any solution?

I tried to just ignore your code example, because it is difficult to understand for me. If you want try out my attempt:
def button_split_by_prod_categ(self):
    """Split this purchase order into one order per product category.

    The lines of the first category encountered stay on the current
    order. For every other category a copy of the order is created
    without lines, and that category's lines are moved onto the copy.
    Must be called on a single record. Returns None.
    """
    self.ensure_one()
    groups = {}
    # group lines by product category
    for line in self.order_line:
        categ = line.product_id.categ_id
        if categ not in groups:
            groups[categ] = line
        else:
            # recordset union; the original "=|" was a syntax error
            groups[categ] |= line
    skip = True
    orders = self
    for lines in groups.values():
        # the first group keeps its lines on the original order
        if skip:
            skip = False
            continue
        # every other group gets a new order created without lines,
        # and the group's lines are re-attached to it
        default_values = {'order_line': []}
        new_order = self.copy(default=default_values)
        lines.write({'order_id': new_order.id})
        orders |= new_order
    # now you could return a list view with all orders
    # or just do 'nothing'
    return

I found a solution to my problem. I don't think it is pretty, but it does the job. Thanks @CZoellner and @Charif DZ for the effort!
# Button handler: for each distinct product category on the order lines,
# copy the purchase order and unlink from the copy every line that
# belongs to a different category.
def btn_split_rfq(self):
flag =[]
for record in self:
if record.order_line:
for rec in record.order_line: # visit every line of the purchase order
flag.append(rec.product_id.categ_id.id) # collect the product category ids
categ_ids=[ii for n,ii in enumerate(flag) if ii not in flag[:n]] # keep only the first occurrence of each category id
categ_ids.sort() # ascending order of category id
for index in categ_ids: # runs once per distinct product category
quotations_ids = [self.copy()]
for order_line in quotations_ids:
# lines of the copied order whose category differs from the current one
prods = self.env['purchase.order.line'].search([('product_categ_id' ,'!=',index),('order_id','=',int(order_line))])
for ids in prods:
self.env['purchase.order.line'].browse(ids.id).unlink()
else:
raise ValidationError(_('Not Available Purchase Order Lines'))

Check string for specific format of substring, how to..?

Two strings. My items name:
Parfume name EDT 50ml
And competitor's items name:
Parfume another name EDP 60ml
And i have a long list of these names in one column, competitors names in other column, and I want to leave only those rows in dataframe, that have same amount of ml in both my and competitors names no matter what everything else in these strings look like. So how do I find a substring ending with 'ml' in a bigger string? I could simply do
"**ml" in competitors_name
to see if they both contain the same amount of ml.
Thank you
UPDATE
'ml' is not always at the end of string. It might look like this
Parfume yet another great name 60ml EDP

Try this:
import re
def same_measurement(my_item, competitor_item, unit="ml"):
    """Return True when both item names state the same amount of *unit*.

    The first "<digits><unit>" occurrence in each name is used, wherever
    it appears in the string. The unit is matched literally (regex
    metacharacters in it are escaped), case-insensitively, and may be
    separated from the number by whitespace. Amounts are compared as
    integers, so "050ml" equals "50ml". Returns False when either name
    has no such amount.
    """
    matcher = re.compile(r"(\d+)\s*{}".format(re.escape(unit)), re.IGNORECASE)
    my_match = matcher.search(my_item)
    competitor_match = matcher.search(competitor_item)
    if my_match is None or competitor_match is None:
        return False
    return int(my_match.group(1)) == int(competitor_match.group(1))
my_item = "Parfume name EDT 50ml"
competitor_item = "Parfume another name EDP 50ml"
assert same_measurement(my_item, competitor_item)
my_item = "Parfume name EDT 50ml"
competitor_item = "Parfume another name EDP 60ml"
assert not same_measurement(my_item, competitor_item)

You could use the python Regex library to select the 'xxml' values for each of your data rows and then do some logic to check if they match.
import re
# Whole run of digits followed by "ml" in any letter case. An unbounded
# \d+ is used so "1000ml" is not truncated to "000ml".
ML_PATTERN = re.compile(r'\d+ml', re.IGNORECASE)


def find_ml(text):
    """Return the first "<digits>ml" token in *text*, lower-cased, or None."""
    found = ML_PATTERN.search(text)
    return found.group(0).lower() if found else None


data_rows = [["Parfume name EDT", "Parfume another name EDP 50ml"]]
for data_pairs in data_rows:
    # Check for my ml matches and set value
    my_ml = find_ml(data_pairs[0])
    if my_ml is None:
        print("my_ml has no ml")
    # Check for comp ml matches and set value
    comp_ml = find_ml(data_pairs[1])
    if comp_ml is None:
        print("comp_ml has no ml")
    # Print outputs
    if my_ml is not None and comp_ml is not None:
        if my_ml == comp_ml:
            print("my_ml: {0} == comp_ml: {1}".format(my_ml, comp_ml))
        else:
            print("my_ml: {0} != comp_ml: {1}".format(my_ml, comp_ml))
Where data_rows = each row in the data set
Where data_pairs = {your_item_name, competitor_item_name}

You could use a lambda function to do that.
import pandas as pd
import re
d = {
    'Us':
        ['Parfume one 50ml', 'Parfume two 100ml'],
    'Competitor':
        ['Parfume uno 50ml', 'Parfume dos 200ml'],
}
df = pd.DataFrame(data=d)


def ml_amount(name):
    """Return the digits that precede the first 'ml' in *name*, or None."""
    found = re.search(r'(\d+)ml', name)
    return found.group(1) if found else None


def same_ml(row):
    """Return 'Yes' when both names in *row* state the same ml amount, else 'No'.

    A name without any ml amount yields 'No' instead of raising.
    """
    ours = ml_amount(row['Us'])
    theirs = ml_amount(row['Competitor'])
    return 'Yes' if ours is not None and ours == theirs else 'No'


df['Eq'] = df.apply(same_ml, axis=1)
Result:
It doesn't matter whether 'ml' is at the end or in the middle of the string.

List comes back as empty when retrieving data from website ; Python

I am trying to parse data from a website by inserting the data into a list, but the list comes back empty.
# Fetch the page and collect the text of every <td> cell into list0_v2.
# NOTE(review): the URL literal contains a space after "md-" — confirm it is intended.
url =("http://www.releasechimps.org/resources/publication/whos-there-md- anderson")
http = urllib3.PoolManager()
r = http.request('Get',url)
# soup is built here but not used in the lines below
soup = BeautifulSoup(r.data,"html.parser")
#print(r.data)
# only bare <td> tags are matched; a <td> carrying attributes is not
loop = re.findall(r'<td>(.*?)</td>',str(r.data))
#print(str(loop))
newLoop = str(loop)
#print(newLoop)
# the cell count is hard-coded: loop[x] raises IndexError if fewer than 1229 cells matched
for x in range(1229):
# str(bytes) renders newlines and tabs as literal backslash sequences, hence the doubled backslashes
if "\\n\\t\\t\\t\\t" in loop[x]:
loop[x] = loop[x].replace("\\n\\t\\t\\t\\t","")
list0_v2.append(str(loop[x]))
print(loop[x])
print(str(list0_v2))

Edit: Didn't really have anything else going on, so I made your data format into a nice list of dictionaries. There's a weird <td height="26"> on monkey 111, so I had to change the regex slightly.
Hope this helps you, I did it cause I care about the monkeys man.
import html
import re
import urllib.request
# Download the page, pull the text out of every <td> cell, then regroup the
# flat cell list into one dictionary per table row.
list0_v2 = []
final_list = []
url = "http://www.releasechimps.org/resources/publication/whos-there-md-anderson"
data = urllib.request.urlopen(url).read()
# "<td.*?>" also accepts cells that carry attributes, e.g. <td height="26">
loop = re.findall(r'<td.*?>(.*?)</td>', str(data))
for item in loop:
    # The original guard (`"..." or "em>" in item`) was always true, so the
    # clean-up is simply applied to every cell; replace() is a no-op when
    # there is nothing to remove.
    item = item.replace("\\n\\t\\t\\t\\t", "").replace("<em>", "")\
        .replace("</em>", "")
    # NOTE(review): this skips cells that are exactly one space — possibly
    # the page's empty-cell placeholder; confirm against the real HTML.
    if " " == item:
        continue
    list0_v2.append(item)

while list0_v2:
    form = {"n":0, "name":"", "id":"", "gender":"", "birthdate":"", "notes":""}
    # A row has five cells, plus an optional sixth "notes" cell, recognised
    # by its trailing full stop.
    has_notes = len(list0_v2) > 5 and list0_v2[5][-1:] == '.'
    width = 6 if has_notes else 5
    if len(list0_v2) < width:
        # trailing partial row: not enough cells left to form a record
        break
    record = list0_v2[:width]
    del list0_v2[:width]
    numb, name, ids, gender, birthdate = record[:5]
    if has_notes:
        form["notes"] = record[5]
    form["n"] = int(numb)
    form["name"] = html.unescape(name)
    form["id"] = ids
    form["gender"] = gender
    form["birthdate"] = birthdate
    final_list.append(form)

for li in final_list:
    print("{:3} {:10} {:10} {:3} {:10} {}".format(li["n"], li["name"], li["id"],\
        li["gender"], li["birthdate"], li["notes"]))

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Add gender column searching rows that contains info from 2 tables - python

Related

Finding farmers markets from zip codes or town names

How to read a csv file and sum values based on user input?

Split Purchase Order according to Product Category

Check string for specific format of substring, how to..?

List comes back as empty when retrieveing data from website ; Python

Categories

Resources