Define a function to a dataframe - python

Consider this code in Python
filter_2020 = hdb_million[hdb_million["year"]==2020]
town_2020 = filter_2020["town"].unique()
results_2020 = len(town_2020)
print(town_2020)
print("in 2020 the number of towns are:", results_2020)
print("\n")
filter_2021 = hdb_million[hdb_million["year"]==2021]
town_2021 = filter_2021["town"].unique()
results_2021 = len(town_2021)
print(town_2021)
print("in 2021 the number of towns are:", results_2021)
print("\n")
filter_2022 = hdb_million[hdb_million["year"]==2022]
town_2022 = filter_2022["town"].unique()
results_2022 = len(town_2022)
print(town_2022)
Output
['BUKIT MERAH' 'CLEMENTI' 'TOA PAYOH' 'KALLANG/WHAMPOA' 'QUEENSTOWN'
'BISHAN' 'CENTRAL AREA' 'ANG MO KIO' 'GEYLANG']
in 2020 the number of towns are: 9
['BISHAN' 'BUKIT MERAH' 'CENTRAL AREA' 'CLEMENTI' 'QUEENSTOWN' 'SERANGOON'
'TOA PAYOH' 'BUKIT TIMAH' 'KALLANG/WHAMPOA' 'ANG MO KIO']
in 2021 the number of towns are: 10
['ANG MO KIO' 'BISHAN' 'BUKIT TIMAH' 'CENTRAL AREA' 'CLEMENTI' 'GEYLANG'
'KALLANG/WHAMPOA' 'QUEENSTOWN' 'SERANGOON' 'TOA PAYOH' 'BUKIT MERAH'
'YISHUN' 'PASIR RIS' 'WOODLANDS' 'BUKIT BATOK' 'HOUGANG' 'MARINE PARADE'
'PUNGGOL' 'TAMPINES' 'BEDOK']
in 2022 the number of towns are: 20
Instead of repeating the code for each year, can I define a function that produces the same output? I tried writing several functions with def but was not successful. I would be most grateful for any insights. Thank you.

You can iterate over the years in a loop:
for year in [2020, 2021, 2022]:
filter_year = hdb_million[hdb_million["year"]== year]
town = filter_year["town"].unique()
results = len(town)
print(town)
print(f"in {year} the number of towns are:", results)
print("\n")

If you just want the printing to be repeated using a function:
def print_town_info(year, town_input):
    """Print the distinct towns recorded for one year and how many there are.

    Args:
        year: Value compared against the ``"year"`` column.
        town_input: DataFrame that has ``"year"`` and ``"town"`` columns.
    """
    # Keep only the rows for the requested year, then de-duplicate the towns.
    filter_year = town_input[town_input["year"] == year]
    town = filter_year["town"].unique()
    print(town)
    print(f"in {year} the number of towns are:", len(town))
    print("\n")
Then if you want it in a loop:
for y in [2020, 2021, 2022]:
print_town_info(y, hdb_million)

Related

How To Add Specific Keys In Nested Dictionary In Python

I may be formatting this dictionary wrong (my first time doing this)
I have a dictionary of every province with corrected ID and added it to value "Canada". I'm trying to add the population of ALL the provinces in the nested dictionary
ontario = dict(capital="Toronto", largest="Toronto", population=14826276)
quebec = dict(capital="Quebec City", largest="Montreal", population=8604495)
nova_Scotia = dict(capital="Halifax", largest='Halifax', population=992055)
new_Brunswick = dict(capital="Fredricton", largest='Moncton', population=789225)
manitoba = dict(capital="Winnipeg", largest="Winnipeg", population=1383765)
canada = {ontario, quebec, nova_Scotia, new_brunswick, manitoba, british_columbia, prince_edward_island, saskatchewan, alberta, newfoundland_and_labrador}
for key, value in canada.items():
if value and 'population' in value.keys():
# Adding all values of population to receive total population of canada
sum += value['population']
print(sum)
thanks again in advance.
You didn't create a dictionary but a set (which doesn't have keys).
To create a dictionary you need keys, for example either of:
canada = {1:ontario, 2:quebec, 3:nova_scotia, 4:new_brunswick, 5:manitoba}
canada = {"Ontario":ontario, "Quebec":quebec, "Nova Scotia":nova_scotia, "New Brunswick":new_brunswick, "Manitoba":manitoba}
and then you can use canada.items() and sum population
(I use the variable name total because sum() is already a built-in function.)
# --- before `for`-loop ---
total = 0
# --- `for`-loop ---
for key, value in canada.items():
total += value['population']
# --- after `for`-loop ---
print(total)
or shorter
total = sum(value['population'] for value in canada.values())
and then you can add to this dictionary
canada['total'] = total
Full code:
ontario = dict(capital="Toronto", largest="Toronto", population=14826276)
quebec = dict(capital="Quebec City", largest="Montreal", population=8604495)
nova_scotia = dict(capital="Halifax", largest='Halifax', population=992055)
new_brunswick = dict(capital="Fredricton", largest='Moncton', population=789225)
manitoba = dict(capital="Winnipeg", largest="Winnipeg", population=1383765)
canada = {1:ontario, 2:quebec, 3:nova_scotia, 4:new_brunswick, 5:manitoba}#, british_columbia, prince_edward_island, saskatchewan, alberta, newfoundland_and_labrador
total = 0
for key, value in canada.items():
total += value['population']
print(total)
#total = sum(value['population'] for value in canada.values())
canada['total'] = total
print(canada)
I only added the listed 5 provinces into the nested dictionary.
I used a for loop to calculate the total population of Canada (the
sum of the 5 listed provinces).
Note that my nested dictionary has the same format as a normal dictionary,
"a key : value" --> "1 : ontario"
ontario = dict(capital="Toronto", largest="Toronto", population=14826276)
quebec = dict(capital="Quebec City", largest="Montreal", population=8604495)
nova_Scotia = dict(capital="Halifax", largest='Halifax', population=992055)
new_Brunswick = dict(capital="Fredricton", largest='Moncton', population=789225)
manitoba = dict(capital="Winnipeg", largest="Winnipeg", population=1383765)
# Nested dictionary: each key maps to one province's own dictionary.
canada = {1: ontario, 2: quebec, 3: nova_Scotia, 4: new_Brunswick, 5: manitoba}
#canada = {ontario, quebec, nova_Scotia, new_brunswick, manitoba, british_columbia, prince_edward_island, saskatchewan, alberta, newfoundland_and_labrador}
# Named `total` rather than `sum` so the built-in sum() is not shadowed.
total = 0
for province in canada.values():
    # Adding all values of population to receive total population of canada
    total += province["population"]
print(total)
Try this one.
ontario = dict(capital="Toronto", largest="Toronto", population=14826276)
quebec = dict(capital="Quebec City", largest="Montreal", population=8604495)
nova_Scotia = dict(capital="Halifax", largest='Halifax', population=992055)
new_Brunswick = dict(capital="Fredricton", largest='Moncton', population=789225)
manitoba = dict(capital="Winnipeg", largest="Winnipeg", population=1383765)
canada_list = [ontario, quebec, nova_Scotia, new_Brunswick, manitoba]
total = 0
for item in canada_list:
# Adding all values of population to receive total population of canada
total += item.get('population', 0)
print("Total: {}".format(total))
Output:
Total: 26595816

Trying to find averages from a .txt but I keep getting ValueError: could not convert string to float: ''

I'm using the txt file: https://drive.google.com/file/d/1-VrWf7aqiqvnshVQ964zYsqaqRkcUoL1/view?usp=sharing
I'm running the script:
data = f.read()
ny_sum=0
ny_count=0
sf_sum=0
sf_count=0
for line in data.split('\n'):
print(line)
parts = line.split('\t')
city = parts[2]
amount = float(parts[4])
if city == 'San Francisco':
sf_sum = sf_sum + amount
elif city == 'New York':
ny_sum = ny_sum + amount
ny_count = ny_count + 1
ny_avg = ny_sum / ny_count
sf_avg = sf_sum / sf_count
#print(ny_avg, sf_avg)
f = open('result_file.txt', 'w')
f.write('The average transaction amount based on {} transactions in New York is {}\n'.format(ny_count, ny_avg))
f.write('The average transaction amount based on {} transactions in San Francisco is {}\n'.format(sf_count, sf_avg))
if ny_avg>sf_avg:
f.write('New York has higher average transaction amount than San Francisco\n')
else:
f.write('San Francisco has higher average transaction amount than New York\n')
f.close()
And I ALWAYS get the error:
ValueError: could not convert string to float: ''
I'm pretty new-ish to Python and I'm really not sure what I'm doing wrong here. I'm trying to get averages for New York and San Francisco, then export the results AND the comparison to a txt results file
This should give you what you're looking for:
from collections import defaultdict as DD
with open('New Purchases.txt') as pfile:
    # city -> [running amount total, number of transactions]
    sums = DD(lambda: [0.0, 0])
    for line in pfile:
        fields = line.split('\t')
        try:
            k = fields[2]
            amount = float(fields[4])
        except (IndexError, ValueError):
            # Row is too short or its amount is not numeric (e.g. a blank
            # line): skip it *before* touching `sums`, so no zero-count
            # entry is created that would later divide by zero.
            continue
        sums[k][0] += amount
        sums[k][1] += 1
for k in ['San Francisco', 'New York']:
    # The default count of 1 avoids a ZeroDivisionError when a city never appears.
    v = sums.get(k, [0.0, 1])
    print(f'Average for {k} = ${v[0]/v[1]:.2f}')
I have re-arranged the code. I agree with BrutusFocus that the splits are making it difficult to read exactly the location on each row. I have set it so if it sees the location at any point in the row, it counts it.
with open("data.txt", "r") as f:
    data = f.read()

ny_sum = 0
ny_count = 0
sf_sum = 0
sf_count = 0
for line in data.split('\n'):
    parts = line.split('\t')
    try:
        amount = float(parts[4])
    except (IndexError, ValueError):
        # Blank lines (such as the one after the final newline) and rows
        # without a numeric amount are what raise
        # "could not convert string to float: ''" -- skip them.
        continue
    city = parts[2]
    print(city, amount)
    # Match the city anywhere in the row rather than in one exact column.
    if "New York" in line:
        ny_sum = ny_sum + amount
        ny_count = ny_count + 1
    elif "San Francisco" in line:
        sf_sum = sf_sum + amount
        sf_count = sf_count + 1

# Guard against a city with no transactions so the division cannot fail.
ny_avg = ny_sum / ny_count if ny_count else 0
sf_avg = sf_sum / sf_count if sf_count else 0
#print(ny_avg, sf_avg)
# `with` closes the result file even if one of the writes fails.
with open('result_file.txt', 'w') as f:
    f.write('The average transaction amount based on {} transactions in New York is {}\n'.format(ny_count, ny_avg))
    f.write('The average transaction amount based on {} transactions in San Francisco is {}\n'.format(sf_count, sf_avg))
    if ny_avg > sf_avg:
        f.write('New York has higher average transaction amount than San Francisco\n')
    else:
        f.write('San Francisco has higher average transaction amount than New York\n')

Python float print in real time

Is there any way to print the numbers in real time instead of printing them one by one? I have 6 different countries:
china = 1399746872
india = 1368138206
USA = 327826334
Japan = 12649000
Russia = 146804372
Sweden = 10379295
I change these numbers in the script, but how do I print them so that I can see them change?
!EDITED!
I want to overwrite this list every time it prints, so that I see the numbers go up:
Countries = []
china = 1399746872
india = 1368138206
USA = 327826334
Japan = 12649000
Russia = 146804372
Sweden = 10379295
Countries.append(china)
Countries.append(india)
Countries.append(USA)
Countries.append(Japan)
Countries.append(Russia)
Countries.append(Sweden)
print(Countries)
You could use os.system("cls") to clear the console (cls is the Windows command; on Linux/macOS use os.system("clear") instead).
I made a little demo:
import time, sys, json, os
from random import randint
vals = {
"china": 1399746872,
"india": 1368138206,
"USA": 327826334,
"Japan": 12649000,
"Russia": 146804372,
"Sweden": 10379295
}
for _ in range(100):
# clear console
os.system("cls")
# print values
[print(f"{k}: {v}") for k, v in vals.items()]
# renew values with random generated integers
vals = {k:randint(0, 1000000) for k in vals}
# sleep 5s
time.sleep(5)

Generating a random sentence in python

I have the following input which contains Aircraft and Services
AirCrafts={'Cargo Aircraft':'1','International Aircraft':'2','Domestic Aircraft':'3'}
Services={
'AirCrafts':[1,2,3],
'Cargo Aircraft':[
"Fedx","DHFL","Emirates SkyCargo","Qatar Airways Cargo","Cathay Pacific Cargo"
],
'International Aircraft':[
"Air India","Air France","BA"
],
'Domestic Aircraft':[
"TruJet","Indigo","AirAsia"
]
}
I generate the following sentences using the SenGen function.
def SenGen(alpha,beta):
for keys,values in alpha.items():
for key,value in beta.items():
if key in keys:
if values == []:
print(f"{keys} are not found.")
if len(values) > 1:
print(f"{keys} are ", end="\n")
for i, val in enumerate(values):
if len(values) == 1:
if "Not found" in value:
print(f"{keys} are {val}. ", end="\n")
else:
print(f"{keys} is {val}. ", end = "\n")
else:
if i == len(values)-1:
print(f"•{val}. ", end="\n")
elif i == len(value)-2:
print(f"•{val} ", end="\n")
else:
print(f"•{val}, ", end="\n")
My generated output is below after running SenGen(Services,AirCrafts).
SenGen(Services,AirCrafts)
International Aircraft are
•Air India,
•Air France,
•BA.
Domestic Aircraft are
•TruJet,
•Indigo,
•AirAsia.
In the above output, I get "International Aircraft are" and "Domestic Aircraft are".
In place of "International Aircraft are", I want to generate a proper sentence, so that my output looks like
International Aircrafts that run from the various airport of India are
and, for the Domestic Aircraft,
Domestic Aircraft which run in within India are
How can I generate a proper sentence as shown above?
So basically this is the line which needs to change as per your requirement:
if len(values) > 1:
print(f"{keys} are ", end="\n")
You can change it to this, and you will be good to go:
if len(values) > 1:
# for international the statement is different
if keys == "International Aircraft":
print(f"{keys} that run from the various airport of India are ", end="\n")
elif keys == "Domestic Aircraft":
print(f"{keys} which run in within India are", end="\n")
else:
# your else print
Additional Information
To return the statement instead of printing it, build the string with concatenation and return it in place of the print calls:
if len(values) > 1:
# for international the statement is different
if keys == "International Aircraft":
return str(keys) + " that run from the various airport of India are \n"
elif keys == "Domestic Aircraft":
return str(keys) + " which run in within India are \n"
else:
# your else print
In your code, you need to plug in the additional text. Otherwise the program will not know that you want these texts added.
if len(values) > 1:
if "International" in keys:
print(f"{keys} that run from the various airports of India are ", end="\n")
elif "Domestic" in keys:
print(f"{keys} which run in within India are ", end="\n")
else:
print(f"{keys} are ", end="\n")
Here's the code that provides you a return statement.
def SenGen(alpha, beta):
    """Return one paragraph per aircraft category, listing its services.

    Args:
        alpha: Mapping whose keys are the category names to report on
            (its values are not used).
        beta: Mapping from category name to the list of service names.

    Returns:
        A string with, for each key of ``alpha`` in order, a leading
        newline, a category-specific lead-in, the services separated by
        ``",\\n"`` and a trailing newline. Categories other than the three
        known ones yield ``"<name> are not found"`` instead.

    Raises:
        KeyError: If a known category is a key of ``alpha`` but not of
            ``beta``.
    """
    # Lead-in text that follows the category name for each known category.
    lead_ins = {
        'International Aircraft': ' that run from the various airports of India are :\n',
        'Domestic Aircraft': ' which run within India are :\n',
        'Cargo Aircraft': ' are :\n',
    }
    paragraphs = []  # one entry per category, joined once at the end
    for keys in alpha:
        if keys in lead_ins:
            paragraphs.append('\n' + keys + lead_ins[keys] + ',\n'.join(beta[keys]) + '\n')
        else:
            paragraphs.append('\n' + keys + ' are not found' + '\n')
    return ''.join(paragraphs)
x = SenGen(AirCrafts,Services)
print (x)
If you don't want the results to be printed separately on each line, you can remove the \n from the string.
Output for your reference:
>>> print (x)
Cargo Aircraft are :
Fedx,
DHFL,
Emirates SkyCargo,
Qatar Airways Cargo,
Cathay Pacific Cargo
International Aircraft that run from the various airports of India are :
Air India,
Air France,
BA
Domestic Aircraft which run within India are :
TruJet,
Indigo,
AirAsia
>>> x
'\nCargo Aircraft are :\nFedx,\nDHFL,\nEmirates SkyCargo,\nQatar Airways Cargo,\nCathay Pacific Cargo\n\nInternational Aircraft that run from the various airports of India are :\nAir India,\nAir France,\nBA\n\nDomestic Aircraft which run within India are :\nTruJet,\nIndigo,\nAirAsia\n'
>>>

Determining most common name from web scraped birth name data

I have the task to do web scraping from this page https://www.ssa.gov/cgi-bin/popularnames.cgi. There you can find a list of the most common birth names. Now I have to find the most common name that both girls and boys have for a given year (in other words, the exact same name is used in both genders), but I don't know how I am able to do that. With the code below I solved the previous task to output the list for a given year but I have no clue how I can modify my code so I get the most common name that both girls and boys have.
import requests
import lxml.html as lh
url = 'https://www.ssa.gov/cgi-bin/popularnames.cgi'
string = input("Year: ")
r = requests.post(url, data=dict(year=string, top="1000", number="n" ))
doc = lh.fromstring(r.content)
tr_elements = doc.xpath('//table[2]//td[2]//tr')
cols = []
for col in tr_elements[0]:
name = col.text_content()
number = col.text_content()
cols.append((number, []))
count=0
for row in tr_elements[1:]:
i = 0
for col in row:
val = col.text_content()
cols[i][1].append(val)
i += 1
if(count<4):
print(val, end = ' ')
count += 1
else:
count=0
print(val)
Here's one approach. The first step is to group the data by name and record how many genders have used the name and their aggregate total. After that, we can filter the structure by names with more than one gender using it. Finally, we sort this multi-gender list by counts and take the 0-th element. This is our most popular multi-gender name for the year.
import requests
import lxml.html as lh
url = "https://www.ssa.gov/cgi-bin/popularnames.cgi"
year = input("Year: ")
response = requests.post(url, data=dict(year=year, top="1000", number="n"))
doc = lh.fromstring(response.content)
tr_elements = doc.xpath("//table[2]//td[2]//tr")
column_names = [col.text_content() for col in tr_elements[0]]
names = {}
most_common_shared_names_by_year = {}
for row in tr_elements[1:-1]:
row = [cell.text_content() for cell in row]
for i, gender in ((1, "male"), (3, "female")):
if row[i] not in names:
names[row[i]] = {"count": 0, "genders": set()}
names[row[i]]["count"] += int(row[i+1].replace(",", ""))
names[row[i]]["genders"].add(gender)
shared_names = [
(name, data) for name, data in names.items() if len(data["genders"]) > 1
]
most_common_shared_names = sorted(shared_names, key=lambda x: -x[1]["count"])
print("%s => %s" % most_common_shared_names[0])
If you're curious, here are the results since 2000:
2000 => Tyler, 22187
2001 => Tyler, 19842
2002 => Tyler, 18788
2003 => Ryan, 20171
2004 => Madison, 20829
2005 => Ryan, 18661
2006 => Ryan, 17116
2007 => Jayden, 17287
2008 => Jayden, 19040
2009 => Jayden, 19053
2010 => Jayden, 18641
2011 => Jayden, 18064
2012 => Jayden, 16952
2013 => Jayden, 15462
2014 => Logan, 14478
2015 => Logan, 13753
2016 => Logan, 12099
2017 => Logan, 15117

Categories