How to apply a pandas geocode function to a PySpark column - python

Table is like this
id  ADDRESS
0   6101 SUMMITVIEW AVE STE 200 YAKIMA
1   527 CEDAR WAY SUITE 105 OAKMONT
2   1700 N ROSE AVE SUITE 460 OXNARD
3   1275 YORK AVE NEW YORK
4   2300 MANCHESTER EXPY A SUITE 101 A COLUMBUS
5   401 N MICHIGAN AVE CHICAGO
6   111 GROSSMAN DR INTERNAL MEDICINE BRAINTREE
7   1850 N CENTRAL AVE STE 1600 PHOENIX
8   47 NEW SCOTLAND AVENUE ALBANY MEDICAL CENTER A...
9   201 N VINE ST EL DORADO
10  4420 LAKE BOONE TRL RALEIGH
11  2727 W HOLCOMBE BLVD HOUSTON
12  850 PETER BRYCE BLVD TUSCALOOSA
13  1803 WEHRLI RD NAPERVILLE
14  4321 N MACDILL AVE STE 203 TAMPA
15  111 CONTINENTAL DR SUITE 412 NEWARK
16  1834 E INNOVATION PARK DR ORO VALLEY
17  880 KEMPSVILLE RD SUITE 2200 NORFOLK
18  701 PRINCETON AVE SW BIRMINGHAM
19  4729 COUNTY ROAD 101 MINNETONKA
import pandas as pd
import geopandas as gpd
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import matplotlib.pyplot as plt
import folium
from folium.plugins import FastMarkerCluster

locator = Nominatim(user_agent="myGeocoder")
geocode = RateLimiter(locator.geocode, min_delay_seconds=0.0, error_wait_seconds=1.0, swallow_exceptions=True, return_value_on_exception=None)
apprix_1_na['location'] = apprix_1_na['ADDRESS'].apply(geocode)
apprix_1_na['point'] = apprix_1_na['location'].apply(lambda loc: tuple(loc.point) if loc else None)
I want this code to work in PySpark so I can get longitude and latitude columns.

I'll show a "complex" example with the GoogleV3 API. It is easily adaptable to your case.
from geopy.geocoders import GoogleV3
from pyspark.sql.functions import col, udf
from pyspark.sql.types import FloatType, ArrayType
df = spark.createDataFrame([("123 Fake St, Springfield, 12345, USA",),("1000 N West Street, Suite 1200 Wilmington, DE 19801, USA",)], ["address"])
df.display()
address
123 Fake St, Springfield, 12345, USA
1000 N West Street, Suite 1200 Wilmington, DE 19801, USA
@udf(returnType=ArrayType(FloatType()))
def geoloc(address):
    api = 'your_api_key_here'
    geolocator = GoogleV3(api)
    # get the (lat, long) tuple from the Location result
    return geolocator.geocode(address)[1]
#find coord
df = df.withColumn('geocode', geoloc(col('address')))
#separate tuple
df = df.withColumn("latitude", col('geocode').getItem(0))\
.withColumn("longitude", col('geocode').getItem(1))
df.display()
address                                                   geocode                   latitude   longitude
123 Fake St, Springfield, 12345, USA                      [44.046238, -123.022026]  44.046238  -123.022026
1000 N West Street, Suite 1200 Wilmington, DE 19801, USA  [39.74717, -75.54999]     39.74717   -75.54999
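For the Nominatim setup from the question, the same UDF pattern should apply. Below is a minimal sketch, not a tested implementation: it assumes an active SparkSession and a df with an address column as above, builds the geocoder inside the UDF so it is instantiated on the workers (at the cost of one instance per call), and keeps the RateLimiter to respect Nominatim's usage policy.
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, FloatType

@udf(returnType=ArrayType(FloatType()))
def geoloc_nominatim(address):
    # instantiated per call so the UDF stays picklable; slow but simple
    locator = Nominatim(user_agent="myGeocoder")
    geocode = RateLimiter(locator.geocode, min_delay_seconds=1.0, swallow_exceptions=True)
    location = geocode(address)
    return [location.latitude, location.longitude] if location else None

df = df.withColumn('geocode', geoloc_nominatim(col('address')))\
       .withColumn("latitude", col('geocode').getItem(0))\
       .withColumn("longitude", col('geocode').getItem(1))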

Related

Pandas Groupby Head by Percentage of Row Counts

I have a dataframe:
state city score
CA San Francisco 80
CA San Francisco 90
...
NC Raleigh 44
NY New York City 22
I want to do a groupby.head(), but instead of an integer value, I want to select the top 80%, sorted by Score, of each state-city combo.
So if CA, San Francisco has 100 rows, and NC, Raleigh has 20 rows, the final dataframe would have the top 80 score rows for CA, San Francisco, and the top 16 score rows for NC, Raleigh.
So the final result code might look something like:
df.sort_values('score', ascending=False).groupby(['State', 'City']).head(80%)
Thanks!
from io import StringIO
import pandas as pd
# sample data
s = """state,city,score
CA,San Francisco,80
CA,San Francisco,90
CA,San Francisco,30
CA,San Francisco,10
CA,San Francisco,70
CA,San Francisco,60
CA,San Francisco,50
CA,San Francisco,40
NC,Raleigh,44
NC,Raleigh,54
NC,Raleigh,64
NC,Raleigh,14
NY,New York City,22
NY,New York City,12
NY,New York City,32
NY,New York City,42
NY,New York City,52"""
df = pd.read_csv(StringIO(s))
sample = .8 # 80%
# sort the values and create a groupby object
g = df.sort_values('score', ascending=False).groupby(['state', 'city'])
# use list comprehension to iterate over each group
# for each group, calculate what 80% is
# in other words, the length of each group multiplied by .8
# you then use int to round down to the whole number
new_df = pd.concat([data.head(int(len(data)*sample)) for _, data in g])
print(new_df)
state city score
1 CA San Francisco 90
0 CA San Francisco 80
4 CA San Francisco 70
5 CA San Francisco 60
6 CA San Francisco 50
7 CA San Francisco 40
10 NC Raleigh 64
9 NC Raleigh 54
8 NC Raleigh 44
16 NY New York City 52
15 NY New York City 42
14 NY New York City 32
12 NY New York City 22
Alternatively, use nlargest and compute the number of rows to keep per group from its length, i.e. int(0.8 * len(group)):
res = (
    df.groupby(['state', 'city'], group_keys=False)
      .apply(lambda g: g.nlargest(int(0.8 * len(g)), 'score'))
)
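If you prefer to avoid apply entirely, the same selection can be done vectorized with cumcount and transform. A minimal sketch, assuming the lowercase state/city/score columns from the sample data above:
# rank rows within each (state, city) group after sorting by score,
# then keep rows whose within-group rank is below 80% of the group size
df_sorted = df.sort_values('score', ascending=False)
g = df_sorted.groupby(['state', 'city'])
top_n = (g['score'].transform('size') * 0.8).astype(int)  # floor of 80% per group
new_df = df_sorted[g.cumcount() < top_n]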

Extracting strings from a list by a specific word

I have this column of addresses in pandas and I want to select only the addresses in the US, but I either get an empty list or an error is thrown.
Here's what I have done:
0 238 Lincoln St, Hahnville, LA 70057, USA
1 101 Home Pl Ln, Hahnville, LA 70057, USA
2 1250 Poydras St, New Orleans, LA 70113, USA
3 1117 Broadway STE 401, Tacoma, WA 98402, USA
4 2715 N Junett St, Tacoma, WA 98407, USA
5 Hillstrust Primary School, 29 Nethan St, Govan, Glasgow G51 3LX, UK
6 5778+JM Godalming, UK
7 569 Durham Rd, Low Fell, Gateshead NE9 5EY, UK
8 Pennine Way, Barnard Castle DL12, UK
9 14 Studios Rd, Shepperton TW17 0QW, UK
matching = [s for s in final_data["full_address"] if "USA" in s]
matching
#returns: TypeError: argument of type 'float' is not iterable
#Whereas
ab = [final_data["full_address"]]
matching = [s for s in ab if "USA" in s]
matching
#returns: []
Expected output:
0 238 Lincoln St, Hahnville, LA 70057, USA
1 101 Home Pl Ln, Hahnville, LA 70057, USA
2 1250 Poydras St, New Orleans, LA 70113, USA
3 1117 Broadway STE 401, Tacoma, WA 98402, USA
4 2715 N Junett St, Tacoma, WA 98407, USA
Try this — str.contains does the matching vectorized (the original TypeError most likely comes from NaN values, which are floats, in the column):
import pandas as pd
data = {
    'full_address': [
        '238 Lincoln St, Hahnville, LA 70057, USA',
        '101 Home Pl Ln, Hahnville, LA 70057, USA',
        '1250 Poydras St, New Orleans, LA 70113, USA',
        '1117 Broadway STE 401, Tacoma, WA 98402, USA',
        '2715 N Junett St, Tacoma, WA 98407, USA',
        '5778+JM Godalming, UK',
        '569 Durham Rd, Low Fell, Gateshead NE9 5EY, UK',
        'Pennine Way, Barnard Castle DL12, UK',
        '14 Studios Rd, Shepperton TW17 0QW, UK'
    ]
}
df = pd.DataFrame(data)
matching = df[df['full_address'].str.contains("USA")]
print(matching)
Output:
full_address
0 238 Lincoln St, Hahnville, LA 70057, USA
1 101 Home Pl Ln, Hahnville, LA 70057, USA
2 1250 Poydras St, New Orleans, LA 70113, USA
3 1117 Broadway STE 401, Tacoma, WA 98402, USA
4 2715 N Junett St, Tacoma, WA 98407, USA
Hello, I tried to recreate your scenario and it works for me. I just added a query with a contains statement on the specific column, which here is country:
import pandas as pd
# build a sample DataFrame
names = ['238 Lincoln St, Hahnville, LA 70057, USA', '101 Home Pl Ln, Hahnville, LA 70057, USA', 'Hillstrust Govan, Glasgow G51 3LX, UK']
data = {'country': names}
cars = pd.DataFrame(data)
b = cars.query('country.str.contains("USA")', engine='python')
print(b)
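On the original TypeError: a column with missing values stores them as float NaN, and "USA" in nan is what raises argument of type 'float' is not iterable. str.contains can treat missing entries as non-matches via its na parameter; a minimal sketch with a hypothetical missing row added:
import pandas as pd

df = pd.DataFrame({'full_address': ['238 Lincoln St, Hahnville, LA 70057, USA',
                                    None,  # hypothetical missing address
                                    '5778+JM Godalming, UK']})
# na=False turns missing entries into False in the mask instead of NaN
matching = df[df['full_address'].str.contains('USA', na=False)]
print(matching)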

Scraping of the BBB site converting JSON to a DataFrame

I would like to put this information into a dataframe and then export it to Excel. So far the Python tutorials I have tried produce table errors, and I have had no luck converting the JSON data to a dataframe.
Any tips would be very helpful.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from urllib.request import urlopen
import bs4
import requests, re, json
headers = {'User-Agent':'Mozilla/5.0'}
r = requests.get('https://www.bbb.org/search?find_country=USA&find_entity=10126-000&find_id=357_10126-000_alias&find_text=roofing&find_type=Category&page=1&touched=1', headers = headers)
p = re.compile(r'PRELOADED_STATE__ = (.*?);')
data = json.loads(p.findall(r.text)[0])
results = [(item['businessName'], ' '.join([item['address'],item['city'], item['state'], item['postalcode']]), item['phone']) for item in data['searchResult']['results']]
print(results)
import re
import json
import requests
import pandas as pd
from bs4 import BeautifulSoup
headers = {'User-Agent':'Mozilla/5.0'}
r = requests.get('https://www.bbb.org/search?find_country=USA&find_entity=10126-000&find_id=357_10126-000_alias&find_text=roofing&find_type=Category&page=1&touched=1', headers = headers)
p = re.compile(r'PRELOADED_STATE__ = (.*?);')
data = json.loads(p.findall(r.text)[0])
results = [(item['businessName'], ' '.join([item['address'],item['city'], item['state'], item['postalcode']]), item['phone']) for item in data['searchResult']['results']]
df = pd.DataFrame(results, columns=['Business Name', 'Address', 'Phone'])
print(df)
df.to_csv('data.csv')
Prints:
Business Name Address Phone
0 Trinity Roofing, LLC Stilwell KS 66085-8238 [(913) 432-4425, (303) 699-7999]
1 Trinity Roofing, LLC 14241 E 4th Ave Ste 5-300 Aurora CO 80011-8733 [(913) 432-4425, (303) 699-7999]
2 CMR Construction & Roofing of Texas, LLC 12500 E US Highway 40, Ste. B1 Independence MO... [(855) 376-6326, (855) 766-3267]
3 All-Star Home Repairs LLC 1806 Grove Ave Richmond VA 23220-4506 [(804) 405-9337]
4 MadSky Roofing & Restoration, LLC Bank of America Center, 16th Floor 1111 E. Mai... [(855) 623-7597]
5 Robert Owens Roofing Bealeton VA 22712-9706 [(540) 878-3544]
6 Proof Seal of Athens PO Box 80732 Canton OH 447080732 [(330) 685-6363]
7 Proof Seal of Athens Athens OH 45701-1847 [(330) 685-6363]
8 Tenecela General Services Corp 57 Anderson St Lowell MA 01852-5357 None
9 Water Tight Roofing & Siding 57 Whitehall Way Hyannis MA 02601-2149 [(508) 364-8323]
10 Tenecela General Services Corp 745 Broadway St Fl 2 Lowell MA 01854-3137 None
11 Just In Time Roofing & Contracting, LLC ----- Ft Worth TX 76102 [(888) 666-3122, (254) 296-8016, (888) 370-3331]
12 Paramount Construction of Southerntier NY Inc. 323 Fluvanna Ave. Jamestown NY 14701 [(716) 487-0093]
13 Paramount Construction of Southerntier NY Inc. P O Box 488 Falconer NY 14733 [(716) 487-0093]
14 Paramount Construction of Southerntier NY Inc. 1879 Lyndon Boulevard Falconer NY 14733 [(716) 487-0093]
And saves data.csv (screenshot of the file opened in LibreOffice not reproduced here).
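Since the question asked for Excel, and Phone comes back as a list (or None), one option is to flatten that column before exporting. A minimal sketch, assuming openpyxl is installed; the sample rows are hypothetical and just mirror the output above:
import pandas as pd

# hypothetical sample mirroring the scraped results: Phone is a list or None
df = pd.DataFrame({
    'Business Name': ['Trinity Roofing, LLC', 'Tenecela General Services Corp'],
    'Address': ['Stilwell KS 66085-8238', '57 Anderson St Lowell MA 01852-5357'],
    'Phone': [['(913) 432-4425', '(303) 699-7999'], None],
})
# join list-valued phones into one string so each cell stays flat
df['Phone'] = df['Phone'].apply(lambda p: ', '.join(p) if isinstance(p, list) else '')
df.to_excel('data.xlsx', index=False)  # requires openpyxl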

Split column in DataFrame based on item in list

I have the following table and would like to split each row into three columns: state, postcode and city. State and postcode are easy, but I'm unable to extract the city. I thought about splitting each string after the street synonyms and before the state, but I seem to be getting the loop wrong as it will only use the last item in my list.
Input data:
Address Text
0 11 North Warren Circle Lisbon Falls ME 04252
1 227 Cony Street Augusta ME 04330
2 70 Buckner Drive Battle Creek MI
3 718 Perry Street Big Rapids MI
4 14857 Martinsville Road Van Buren MI
5 823 Woodlawn Ave Dallas TX 75208
6 2525 Washington Avenue Waco TX 76710
7 123 South Main St Dallas TX 75201
The output I'm trying to achieve (for all rows, but I only wrote out the first two to save time)
City State Postcode
0 Lisbon Falls ME 04252
1 Augusta ME 04330
My code:
# Extract postcode and state
df["Zip"] = df["Address Text"].str.extract(r'(\d{5})', expand = True)
df["State"] = df["Address Text"].str.extract(r'([A-Z]{2})', expand = True)
# Split after these substrings
street_synonyms = ["Circle", "Street", "Drive", "Road", "Ave", "Avenue", "St"]
# This is where I got stuck
df["Syn"] = df["Address Text"].apply(lambda x: x.split(syn))
df
Here's a way to do that:
import pandas as pd
# data
df = pd.DataFrame(
    ['11 North Warren Circle Lisbon Falls ME 04252',
     '227 Cony Street Augusta ME 04330',
     '70 Buckner Drive Battle Creek MI',
     '718 Perry Street Big Rapids MI',
     '14857 Martinsville Road Van Buren MI',
     '823 Woodlawn Ave Dallas TX 75208',
     '2525 Washington Avenue Waco TX 76710',
     '123 South Main St Dallas TX 75201'],
    columns=['Address Text'])
# Extract postcode and state
df["Zip"] = df["Address Text"].str.extract(r'(\d{5})', expand=True)
df["State"] = df["Address Text"].str.extract(r'([A-Z]{2})', expand=True)
# Split after these substrings
street_synonyms = ["Circle", "Street", "Drive", "Road", "Ave", "Avenue", "St"]
def find_city(address, state, street_synonyms):
    for syn in street_synonyms:
        if syn in address:
            # remove street
            city = address.split(syn)[-1]
            # remove state and postcode
            city = city.split(state)[0]
            return city

df['City'] = df.apply(lambda x: find_city(x['Address Text'], x['State'], street_synonyms), axis=1)
print(df[['City', 'State', 'Zip']])
"""
City State Zip
0 Lisbon Falls ME 04252
1 Augusta ME 04330
2 Battle Creek MI NaN
3 Big Rapids MI NaN
4 Van Buren MI 14857
5 Dallas TX 75208
6 nue Waco TX 76710
7 Dallas TX 75201
"""

Deleting rows from a DataFrame results in a wrapped DataFrame display in Python

I have below dataframe nbr2:
Postal_Code Borough Neighborhood
0 M1B Scarborough Rouge, Malvern
1 M4C East York Woodbine Heights
2 M4E East Toronto The Beaches
3 M4L East Toronto The Beaches West, India Bazaar
4 M4M East Toronto Studio District
5 M4N Central Toronto Lawrence Park
On applying the code below to filter out rows:
neighbor = nbr2.drop(nbr2[nbr2['Borough'].str.contains("Toronto")==False].index, axis=0, inplace=True)
the dataframe output gets split across multiple lines like below:
Postal_Code Borough \
37 M4E East Toronto
41 M4K East Toronto
42 M4L East Toronto
43 M4M East Toronto
Neighborhood
37 The Beaches
41 The Danforth West\n, Riverdale
42 The Beaches West\n, India Bazaar
43 Studio District\n
The below code also results in a similar structure:
# define the dataframe columns
column_names = ['Postal_Code','Borough', 'Neighborhood']
# instantiate the dataframe
neighbor = pd.DataFrame(columns=column_names)
neighbor = nbr2.drop(nbr2[nbr2['Borough'].str.contains("Toronto")==False].index, axis=0, inplace=True)
Use the following option so pandas prints the full frame on one line instead of wrapping it:
pd.set_option('display.expand_frame_repr', False)
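Two side notes: drop(..., inplace=True) returns None, so neighbor in the snippets above actually ends up as None, and the same filter is simpler with boolean indexing. A minimal sketch on a cut-down sample:
import pandas as pd

nbr2 = pd.DataFrame({'Postal_Code': ['M1B', 'M4E', 'M4N'],
                     'Borough': ['Scarborough', 'East Toronto', 'Central Toronto'],
                     'Neighborhood': ['Rouge, Malvern', 'The Beaches', 'Lawrence Park']})
# boolean indexing returns the filtered frame directly; no inplace needed
neighbor = nbr2[nbr2['Borough'].str.contains('Toronto')].reset_index(drop=True)
print(neighbor)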
