Beautiful soup not loading new page after Selenium click - python

The first page loads and parses as expected, but after clicking Next page, BeautifulSoup does not get the new page from driver.page_source
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
def parse_html(pagesource, count):
    soup = BeautifulSoup(pagesource, 'html.parser')
    tables = soup.findChildren('table')
    # This will get the first (and only) table. Your page may have more.
    my_table = tables[0]
    table_body = my_table.find('tbody')
    all_rows = table_body.find_all('tr')
    # print(all_rows[0])
    for row in all_rows:
        print(count)
        count += 1
        try:
            path_body = row.find("td", class_="views-field-company-name")
            path = path_body.find("a")['href']
            company_name = path_body.find("a").text
            company_name = company_name.strip()
            print(company_name)
            issue_datetime = row.find("td", class_="views-field-field-letter-issue-datetime")
            # print(type(issue_datetime.find("time")['datetime']))
            issue_recepient_office = row.find("td", class_="views-field-field-building").string
            issue_recepient_office = issue_recepient_office.strip()
            # print(issue_recepient_office)
            detailed_description = row.find("td", class_="views-field-field-detailed-description-2").string
            if detailed_description:
                detailed_description = detailed_description.strip()
            else:
                detailed_description = ""
            # print(detailed_description)
        except:
            pass
url = 'https://www.fda.gov/inspections-compliance-enforcement-and-criminal-investigations/compliance-actions-and-activities/warning-letters'
driver.get(url)
count = 1
parse_html(driver.page_source, count)
for i in range(0, 3):
    time.sleep(10)
    # print(driver.page_source.encode('utf-8'))
    WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#datatable_next a'))).click()
    time.sleep(30)
    parse_html(driver.page_source, count)
driver.quit()
Output:
1
Ruth Special Food Store LLC
Foreign Supplier Verification Program (FSVP)
2
EarthLab, Inc., dba Wise Woman Herbals
3
Big Olaf Creamery LLC dba Big Olaf
CGMP/Food/Prepared, Packed or Held Under Insanitary Conditions/Adulterated/L. monocytogenes
4
Bainbridge Beverage West, LLC
Juice HACCP/CGMP for Foods/Adulterated/Insanitary Conditions
5
VapeL1FE, LLC
Family Smoking Prevention and Tobacco Control Act/Adulterated/Misbranded
6
Mike Millenkamp Dairy Cattle
7
Empowered Diagnostics LLC
Unapproved Products Related to the Coronavirus Disease 2019 (COVID-19)
8
RoyalVibe Health Ltd.
CGMP/QSR/Medical Devices/PMA/Adulterated/Misbranded
9
Land View, Inc.
CGMP/Medicated Feeds/Adulterated
10
Green Pharmaceuticals Inc.
1
2
3
4
5
6
7
8
9
10
1
2
3
4
5
6
7
8
9
10
1
2
3
4
5
6
7
8
9
10

Non-Selenium solution:
import requests
from bs4 import BeautifulSoup
import pandas as pd

PAGE_LENGTH = 50

def get_letters(page: int):
    start = page * PAGE_LENGTH
    url = f"https://www.fda.gov/datatables/views/ajax?field_letter_issue_datetime=All&field_change_date_2=All&draw={page}&columns%5B0%5D%5Bdata%5D=0&columns%5B0%5D%5Bsearchable%5D=true&columns%5B0%5D%5Borderable%5D=true&columns%5B0%5D%5Bsearch%5D%5Bregex%5D=false&columns%5B1%5D%5Bdata%5D=1&columns%5B1%5D%5Bsearchable%5D=true&columns%5B1%5D%5Borderable%5D=true&columns%5B1%5D%5Bsearch%5D%5Bregex%5D=false&columns%5B2%5D%5Bdata%5D=2&columns%5B2%5D%5Bsearchable%5D=true&columns%5B2%5D%5Borderable%5D=true&columns%5B2%5D%5Bsearch%5D%5Bregex%5D=false&columns%5B3%5D%5Bdata%5D=3&columns%5B3%5D%5Bsearchable%5D=true&columns%5B3%5D%5Borderable%5D=true&columns%5B3%5D%5Bsearch%5D%5Bregex%5D=false&columns%5B4%5D%5Bdata%5D=4&columns%5B4%5D%5Bsearchable%5D=true&columns%5B4%5D%5Borderable%5D=true&columns%5B4%5D%5Bsearch%5D%5Bregex%5D=false&columns%5B5%5D%5Bdata%5D=5&columns%5B5%5D%5Bsearchable%5D=true&columns%5B5%5D%5Borderable%5D=true&columns%5B5%5D%5Bsearch%5D%5Bregex%5D=false&columns%5B6%5D%5Bdata%5D=6&columns%5B6%5D%5Bsearchable%5D=true&columns%5B6%5D%5Borderable%5D=true&columns%5B6%5D%5Bsearch%5D%5Bregex%5D=false&columns%5B7%5D%5Bdata%5D=7&columns%5B7%5D%5Bname%5D=&columns%5B7%5D%5Bsearchable%5D=true&columns%5B7%5D%5Borderable%5D=false&columns%5B7%5D%5Bsearch%5D%5Bregex%5D=false&start={start}&length={PAGE_LENGTH}&search%5Bregex%5D=false&_drupal_ajax=1&_wrapper_format=drupal_ajax&view_base_path=inspections-compliance-enforcement-and-criminal-investigations%2Fcompliance-actions-and-activities%2Fwarning-letters%2Fdatatables-data&view_display_id=warning_letter_solr_block&view_dom_id=4605f153788b3a17043d0e031eb733846503177581602cd9fd58ecd78629801b&view_name=warning_letter_solr_index&view_path=%2Finspections-compliance-enforcement-and-criminal-investigations%2Fcompliance-actions-and-activities%2Fwarning-letters&total_items=3433"
    letters = []
    for letter in requests.get(url).json()['data']:
        letters.append([BeautifulSoup(row, 'lxml').get_text(strip=True) for row in letter])
    return letters

result = []
for i in range(0, 5):
    result += get_letters(i)
df = pd.DataFrame(result)
print(df)
OUTPUT:
0 1 2 ... 5 6 7
0 12/27/2022 11/07/2022 Land View, Inc. ...
1 12/27/2022 11/22/2022 MD Pharmaceutical Supply, LLC ...
2 12/27/2022 06/01/2022 Supreme Fruit Produce, Inc. ...
3 12/27/2022 10/06/2022 Empowered Diagnostics LLC ...
4 12/27/2022 11/18/2022 RoyalVibe Health Ltd. ...
.. ... ... ... ... .. .. ..
245 08/11/2022 08/11/2022 The Juice Bar ...
246 08/09/2022 06/16/2022 InfuTronix LLC ...
247 08/09/2022 07/12/2022 Zyno Medical LLC ...
248 08/09/2022 07/28/2022 Vitti Labs, LLC ...
249 08/09/2022 07/22/2022 Muscle Feast, LLC ...
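The five pages of 50 fetched above account for the 250 rows in this output; judging by the total_items=3433 parameter in the URL, you would need roughly 69 such pages to fetch the full list.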
UPDATE
To find the request, use dev tools (F12 by default in Chrome).
Now we need to figure out how to work with this data. Each cell is simple HTML text, and bs4 will help us with that. If you need the link, you can change letters.append to:
letters.append({
    'Posted Date': BeautifulSoup(letter[0], 'lxml').get_text(strip=True),
    'Letter Issue Date': BeautifulSoup(letter[1], 'lxml').get_text(strip=True),
    'Company Name': BeautifulSoup(letter[2], 'lxml').get_text(strip=True),
    'Issuing Office': BeautifulSoup(letter[3], 'lxml').get_text(strip=True),
    'Subject': BeautifulSoup(letter[4], 'lxml').get_text(strip=True),
    'Link': 'https://www.fda.gov/' + BeautifulSoup(letter[2], 'lxml').find('a').get('href'),
})
And the new output looks like:
Posted Date Letter Issue Date Company Name Issuing Office Subject Link
0 12/27/2022 11/07/2022 Land View, Inc. Division of Human and Animal Food Operations West VI CGMP/Medicated Feeds/Adulterated https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/land-view-inc-638704-11072022
1 12/27/2022 11/22/2022 MD Pharmaceutical Supply, LLC Division of Pharmaceutical Quality Operations I CGMP/Active Pharmaceutical Ingredient (API)/Adulterated https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/md-pharmaceutical-supply-llc-637815-11222022
2 12/27/2022 06/01/2022 Supreme Fruit Produce, Inc. Division of Southwest Imports Foreign Supplier Verification Program (FSVP) https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/supreme-fruit-produce-inc-631972-06012022
3 12/27/2022 10/06/2022 Empowered Diagnostics LLC Center for Devices and Radiological Health Unapproved Products Related to the Coronavirus Disease 2019 (COVID-19) https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/empowered-diagnostics-llc-638164-10062022
4 12/27/2022 11/18/2022 RoyalVibe Health Ltd. Center for Devices and Radiological Health CGMP/QSR/Medical Devices/PMA/Adulterated/Misbranded https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/royalvibe-health-ltd-639553-11182022
5 12/27/2022 11/28/2022 Bainbridge Beverage West, LLC Division of Human and Animal Food Operations West V Juice HACCP/CGMP for Foods/Adulterated/Insanitary Conditions https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/bainbridge-beverage-west-llc-638942-11282022
6 12/27/2022 12/16/2022 Green Pharmaceuticals Inc. Division of Pharmaceutical Quality Operations IV Drug Product/Adulterated https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/green-pharmaceuticals-inc-635162-12162022
7 12/27/2022 12/16/2022 VapeL1FE, LLC Center for Tobacco Products Family Smoking Prevention and Tobacco Control Act/Adulterated/Misbranded https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/vapel1fe-llc-648624-12162022
8 12/27/2022 12/09/2022 Ruth Special Food Store LLC Division of Northeast Imports Foreign Supplier Verification Program (FSVP) https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/ruth-special-food-store-llc-644551-12092022
9 12/27/2022 11/28/2022 Mike Millenkamp Dairy Cattle Division of Human and Animal Food Operations West II New Animal Drug/Adulterated https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/mike-millenkamp-dairy-cattle-640782-11282022
10 12/27/2022 11/10/2022 EarthLab, Inc., dba Wise Woman Herbals Division of Human and Animal Food Operations West VI CGMP/Dietary Supplement/Adulterated/Misbranded https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/earthlab-inc-dba-wise-woman-herbals-634872-11102022
11 12/27/2022 12/09/2022 Big Olaf Creamery LLC dba Big Olaf Division of Human and Animal Food Operations East IV CGMP/Food/Prepared, Packed or Held Under Insanitary Conditions/Adulterated/L. monocytogenes https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/big-olaf-creamery-llc-dba-big-olaf-642758-12092022
12 12/22/2022 12/22/2022 BS Vapes LLC Center for Tobacco Products Family Smoking Prevention and Tobacco Control Act/Adulterated/Misbranded https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/bs-vapes-llc-647308-12222022
13 12/22/2022 12/22/2022 JP & SN Enterprises Inc. d/b/a eCigs International Center for Tobacco Products Family Smoking Prevention and Tobacco Control Act/Adulterated/Misbranded https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/jp-sn-enterprises-inc-dba-ecigs-international-647315-12222022
14 12/20/2022 11/08/2022 Dollar Tree, Inc. Office of Human and Animal Food Operations – West Division 3 Interstate Commerce/Food/Adulterated https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/dollar-tree-inc-629509-11082022
15 12/20/2022 07/27/2022 Sagent Pharmaceuticals, Inc. Division Pharmaceutical Quality Operations I CGMP/Drugs/Adulterated https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/sagent-pharmaceuticals-inc-636636-07272022
16 12/20/2022 11/21/2022 Nature’s Way Farms, LLC Division of Southwest Imports Foreign Supplier Verification Program (FSVP) https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/natures-way-farms-llc-641201-11212022
17 12/20/2022 12/08/2022 Nortec Quimica SA Center for Drug Evaluation and Research | CDER CGMP/Active Pharmaceutical Ingredient (API)/Adulterated https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/nortec-quimica-sa-639894-12082022
18 12/20/2022 11/30/2022 CHS Inc./CHS River Plains Division of Human and Animal Food Operations West I CGMP/Medicated Feeds/Adulterated https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/chs-incchs-river-plains-642790-11302022
19 12/20/2022 12/02/2022 DuPont Nutrition USA Inc. Division of Pharmaceutical Quality Operations I CGMP/Drug Products/Adulterated https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/dupont-nutrition-usa-inc-627211-12022022
20 12/20/2022 11/01/2022 Del Valle Import Corp. Division of Northeast Imports Foreign Supplier Verification Program (FSVP) https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/del-valle-import-corp-642784-11012022
21 12/20/2022 08/25/2022 Sree Nidhi Corp Center for Food Safety and Applied Nutrition (CFSAN) Foreign Supplier Verification Program (FSVP) https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/sree-nidhi-corp-634266-08252022
22 12/20/2022 12/14/2022 Adarsh Daswani, M.D. Center for Drug Evaluation and Research | CDER Clinical Investigator https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/adarsh-daswani-md-648606-12142022
23 12/15/2022 12/15/2022 Vape King Inc. Center for Tobacco Products Family Smoking Prevention and Tobacco Control Act/Adulterated/Misbranded https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/vape-king-inc-646625-12152022
24 12/15/2022 12/15/2022 Vapor E-Cigarette, L.L.C. Center for Tobacco Products Family Smoking Prevention and Tobacco Control Act/Adulterated/Misbranded https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/vapor-e-cigarette-llc-646876-12152022
25 12/13/2022 12/02/2022 SV3, LLC d/b/a Mi-One Brands Center for Tobacco Products Family Smoking Prevention and Tobacco Control Act/Adulterated/Misbranded https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/sv3-llc-dba-mi-one-brands-647624-12022022
26 12/13/2022 12/07/2022 Centrient Pharmaceuticals India Private Limited Center for Drug Evaluation and Research | CDER CGMP/Active Pharmaceutical Ingredient (API)/Adulterated https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/centrient-pharmaceuticals-india-private-limited-640196-12072022
27 12/13/2022 11/22/2022 Cecilia Alvarez Division of Southwest Imports Foreign Supplier Verification Program (FSVP) https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/cecilia-alvarez-643706-11222022
28 12/13/2022 11/29/2022 Gobwa Exotic Imports Inc. Division of Northeast Imports Foreign Supplier Verification Program (FSVP) https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/gobwa-exotic-imports-inc-641031-11292022
29 12/13/2022 12/05/2022 Thriftmaster Texas, LLC. d/b/a ThriftMaster Global Holdings, Inc. and TM Global Biosciences, LLC Center for Drug Evaluation and Research | CDER Finished Pharmaceuticals/Unapproved New Drug/Misbranded/Adulterated Human Foods https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/thriftmaster-texas-llc-dba-thriftmaster-global-holdings-inc-and-tm-global-biosciences-llc-641057
30 12/13/2022 11/21/2022 Euphoria Fancy Food Inc. Division of Northeast Imports Foreign Supplier Verification Program (FSVP) https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/euphoria-fancy-food-inc-641801-11212022
31 12/08/2022 12/08/2022 Cloud House Vape Center for Tobacco Products Family Smoking Prevention and Tobacco Control Act/Adulterated/Misbranded https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/cloud-house-vape-647544-12082022
32 12/08/2022 12/08/2022 Vapors of Ohio Inc d/b/a Nostalgic Vapes Center for Tobacco Products Family Smoking Prevention and Tobacco Control Act/Adulterated/Misbranded https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/vapors-ohio-inc-dba-nostalgic-vapes-644739-12082022
33 12/06/2022 11/28/2022 AG Hair Limited Center for Drug Evaluation and Research | CDER CGMP/Finished Pharmaceuticals/Adulterated https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/ag-hair-limited-638646-11282022
34 12/06/2022 11/22/2022 Glenmark Pharmaceuticals Limited Center for Drug Evaluation and Research | CDER CGMP/Finished Pharmaceuticals/Adulterated https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/glenmark-pharmaceuticals-limited-637314-11222022
35 12/06/2022 09/23/2022 Saffron USA LLC Division of Human and Animal Food Operations East IV Unapproved New Drugs/Misbranded https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/saffron-usa-llc-629821-09232022
36 12/06/2022 10/24/2022 Cryos International USA LLC Division of Biological Products Operations I Deviations/CFR/Regulations for Human Cells, Tissues & Cellular Products (HCT/Ps) https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/cryos-international-usa-llc-639696-10242022
37 12/06/2022 10/17/2022 Zuland Distributor Corp Division of Southwest Imports Foreign Supplier Verification Program (FSVP) https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/zuland-distributor-corp-638899-10172022
38 12/06/2022 11/07/2022 Manzela USA, LLC Division of Southwest Imports Foreign Supplier Verification Program (FSVP) https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/manzela-usa-llc-642268-11072022
39 12/06/2022 11/07/2022 Maliba African Market Corp. Division of Northeast Imports Foreign Supplier Verification Program (FSVP) https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/maliba-african-market-corp-642698-11072022
40 12/06/2022 11/30/2022 Kari Gran Inc. Division of Pharmaceutical Quality Operations IV CGMP/Finished Pharmaceuticals/Adulterated https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/kari-gran-inc-640035-11302022
41 12/01/2022 12/01/2022 Vapor Candy Inc d/b/a The Vape Stop Center for Tobacco Products Family Smoking Prevention and Tobacco Control Act/Adulterated/Misbranded https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/vapor-candy-inc-dba-vape-stop-645475-12012022
42 11/30/2022 11/30/2022 Jayde's Vapor Lounge Center for Tobacco Products Family Smoking Prevention and Tobacco Control Act/Adulterated/Misbranded https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/jaydes-vapor-lounge-645085-11302022
43 11/29/2022 11/10/2022 Vapor Plus OK LLC Center for Tobacco Products Family Smoking Prevention and Tobacco Control Act/Adulterated/Misbranded https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/vapor-plus-ok-llc-646225-11102022
44 11/29/2022 11/18/2022 David M. Lubeck, M.D./Arbor Centers for EyeCare Center for Drug Evaluation and Research | CDER Clinical Investigator (Sponsor) https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/david-m-lubeck-mdarbor-centers-eyecare-643531-11182022
45 11/29/2022 06/01/2022 Jam Jam Services, Inc. Division of Southeast Imports Foreign Supplier Verification Program (FSVP) https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/jam-jam-services-inc-630847-06012022
46 11/29/2022 09/19/2022 La Serranita Import and Export LLC Division of Northeast Imports Foreign Supplier Verification Program (FSVP) https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/la-serranita-import-and-export-llc-633743-09192022
47 11/29/2022 11/09/2022 J R Imports LLC Division of Southwest Imports Foreign Supplier Verification Program (FSVP) https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/j-r-imports-llc-643214-11092022
48 11/29/2022 09/01/2022 Shuzy Rock Inc. Division of Pharmaceutical Quality Operations I CGMP/Finished Pharmaceuticals/Adulterated https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/shuzy-rock-inc-630110-09012022
49 11/22/2022 10/19/2022 Pepe’s Foods Inc. Division of West Coast Imports Foreign Supplier Verification Program (FSVP) https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/pepes-foods-inc-640716-10192022
50 11/22/2022 11/14/2022 yourtramadol.com Center for Drug Evaluation and Research | CDER Finished Pharmaceuticals/Unapproved New Drug/Misbranded https://www.fda.gov//inspections-compliance-enforcement-and-criminal-investigations/warning-letters/yourtramadolcom-639959-11142022
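Note the doubled slash in each Link (https://www.fda.gov//inspections-...): the href extracted from the cell already starts with a slash, so you can drop the trailing slash from the prefix when building the link:
'Link': 'https://www.fda.gov' + BeautifulSoup(letter[2], 'lxml').find('a').get('href'),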

Related

While loop data not appending to list outside of while loop

I am trying to scrape data, write it to a pd.Series, then go into a while loop for the remaining pages of the website, appending to the original series (located outside of the while loop) after each iteration. I'm not sure why this isn't working. Here's where I'm stuck:
import requests
import pandas as pd
from bs4 import BeautifulSoup

current_url = 'https://www.yellowpages.com/search?search_terms=hvac&geo_location_terms=97080'

def get_data_run(current_url):
    company_names1 = get_company_name(current_url)
    print(company_names1)  #1
    page = 1
    max_page = 3
    company_names1 = paginate(current_url, page, max_page, company_names1)
    print(company_names1)  #2

def paginate(current_url, page, max_page, company_names1):
    while (page <= max_page):
        new_url = current_url + f"&page={page}"
        print(new_url)
        company_names = get_company_name(new_url)
        company_names1.append(company_names)
        print(company_names)  #3
        print(company_names1)  #4
        page += 1
        if page == max_page:
            return company_names1

def get_company_name(url):
    company_names = []
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'lxml')
    box = list(soup.findAll("div", {"class": "result"}))
    for i in range(len(box)):
        try:
            company_names.append(box[i].find("a", {"class": "business-name"}).text.strip())
        except Exception:
            company_names.append("null")
        else:
            continue
    company_names = pd.Series(company_names, dtype='string')
    return company_names

get_data_run(current_url)
I've labeled the different prints. Every time company_names1 prints, it shows the same series of companies, even after appending company_names inside the while loop. The thing I can't understand is that when I print company_names (#3) it prints the next page of company names. I don't understand why it's not appending inside the while loop, and why it's not returning successfully from the function and printing the combined series in the #2 print. Thanks!
UPDATE:
Here is some sample output:
when I print #3:
(pyfinance) justinbenfit@MacBook-Pro-3 yellowpages_scrape % /usr/local/anaconda3/envs/pyfinance/bin/python /Users/justinbenfit/Desktop/yellowpages_scrape/test.py
0 Honke Heating & Air Conditioning
1 Climate Kings Heating & Ac
2 Mike's Truck & Auto Service
3 One Hour Heating & Air Conditioning
4 Morgan Heating & Cooling Inc
5 Rnr Heating Venting & Air Conditioning
6 Universal HVAC Inc
7 Mr Furnace
8 Affordable Excellence Heating
9 Green Air Products
10 David Eugene Neketin
11 Century Heating & Air Cond
12 Appliance Wizard
13 Precision Energy Solutions Inc.
14 Portland Heating & Air Conditioning Co
15 Mhc
16 American Pride Heating and Cooling, LLC
17 Tri Star Western
18 Comfort Zone Heat & Air Inc
19 Don's Air-Care Inc
20 Chuck's Heating & Cooling
21 Mt. Hood Heating Cooling & Refrigeration
22 Chuck's Heating & Cooling
23 Mr. Furnace
24 America's Same Day Service
25 Arctic Commercial Refrigeration LLC
26 Apex Refrigeration
27 Ben's Heating & Air Conditioning LLC
28 David's Appliance Inc
29 Wolcott Heating & Cooling
dtype: string
0 Air-Trix
1 Johnstone Supply
2 Buss Heating & Cooling Inc
3 The Heat Exchange
4 Hoodview Heating & Air Conditioning
5 Loomis Heating Cooling & Refrigeration
6 All About Air Heating & Cooling
7 Hanson Heating
8 Sparks Heating & Cooling
9 Interior Comfort Systems
10 P D X Heating & Cooling
11 Apcom Power Inc
12 Area Heating Inc
13 Four Seasons Heating Air Conditioning & Servic...
14 Perfect Climate Inc
15 Combustion Consultants Inc
16 Classic Heat Source, Inc.
17 Multnomah Heating, Inc
18 Apollo Plumbing, Heating & Air Conditioning - OR
19 Art's Furnace & Air Cond
20 Kurchel Heating
21 P & O Construction Inc
22 Systems Management NW
23 Bridgetown Heating
24 Amana Heating & Air Conditioning Systems
25 QualitySmith
26 Wilbert Jr, Wilson
27 Faith Heating & Air Conditioning Inc
28 Northwest Commercial Heating & Air Conditionin...
29 Heat Master Corp
dtype: string
when I print #1, #2, and #4
0 Honke Heating & Air Conditioning
1 Climate Kings Heating & Ac
2 Mike's Truck & Auto Service
3 One Hour Heating & Air Conditioning
4 Morgan Heating & Cooling Inc
5 Rnr Heating Venting & Air Conditioning
6 Universal HVAC Inc
7 Mr Furnace
8 Affordable Excellence Heating
9 Green Air Products
10 David Eugene Neketin
11 Century Heating & Air Cond
12 Appliance Wizard
13 Precision Energy Solutions Inc.
14 Portland Heating & Air Conditioning Co
15 Mhc
16 American Pride Heating and Cooling, LLC
17 Tri Star Western
18 Comfort Zone Heat & Air Inc
19 Don's Air-Care Inc
20 Chuck's Heating & Cooling
21 Chuck's Heating & Cooling
22 Mr. Furnace
23 Mt. Hood Heating Cooling & Refrigeration
24 America's Same Day Service
25 Arctic Commercial Refrigeration LLC
26 Apex Refrigeration
27 Ben's Heating & Air Conditioning LLC
28 David's Appliance Inc
29 Wolcott Heating & Cooling
dtype: string
The problem is that you're treating a pd.Series like a list: list.append mutates the list in place, while Series.append leaves the original untouched and returns a new Series. Appending data to a list works like this:
lst = [1,2,3]
lst.append(4)
print(lst)
# [1, 2, 3, 4]
The object changes without having to explicitly assign it. If you do the same with Series, this happens:
series = pd.Series([1,2,3])
series.append(pd.Series([4]))
print(series)
The output is:
0 1
1 2
2 3
dtype: int64
So, to update a Series, you have to assign the result back to the original name or to a new one; without that assignment the appended result is simply discarded:
series = pd.Series([1,2,3])
series = series.append(pd.Series([4]))
print(series)
Output:
0 1
1 2
2 3
0 4
dtype: int64
In your case the problem lies in the paginate function; you should change this line:
company_names1.append(company_names)
to:
company_names1 = company_names1.append(company_names)
And everything should work
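Note: Series.append was deprecated in pandas 1.4 and removed in pandas 2.0, so on a current pandas the same pattern is written with pd.concat:
series = pd.Series([1, 2, 3])
series = pd.concat([series, pd.Series([4])])
print(series)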

Need help in matching strings from phrases from multiple columns of a dataframe in python

I need help matching phrases in the data given below, where I need to match phrases from both TextA and TextB.
The following code did not help me do it. How can I address this? I have hundreds of them to match.
# sorting jumbled phrases
def sorts(string_value):
    sorted_string = sorted(string_value.split())
    sorted_string = ' '.join(sorted_string)
    return sorted_string

# removing punctuation from a string
punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''

def punt(test_str):
    for ele in test_str:
        if ele in punc:
            test_str = test_str.replace(ele, "")
    return test_str

# matching strings
def lets_match(x):
    for text1 in TextA:
        for text2 in TextB:
            try:
                if sorts(punt(x[text1.casefold()])) == sorts(punt(x[text2.casefold()])):
                    return True
            except:
                continue
    return False

df['result'] = df.apply(lets_match, axis=1)
Even after implementing string sorting, removing punctuation, and handling case sensitivity, I am still getting those strings as not matching. Am I missing something here? Can someone help me achieve this?
Actually you can use difflib to match the two texts; here's what you can try:
from difflib import SequenceMatcher

def similar(a, b):
    a = str(a).lower()
    b = str(b).lower()
    return SequenceMatcher(None, a, b).ratio()

def lets_match(d):
    print(d[0], " --- ", d[1])
    result = similar(d[0], d[1])
    print(result)
    if result > 0.6:
        return True
    else:
        return False

df["result"] = df.apply(lets_match, axis=1)
You can play with the result > 0.6 threshold.
For more information about difflib, see its documentation. There are other sequence matchers too, like textdistance, but I found this one easy, so I tried it.
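For intuition, here is a quick check on two near-identical company names (borrowed from the data in the answers below); the exact ratio depends on the strings, but similar pairs land well above 0.6:
from difflib import SequenceMatcher
# A one-character difference yields a ratio of roughly 0.96.
print(SequenceMatcher(None, 'cnoc limited', 'cnooc limited').ratio())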
Are there any issues with using a fuzzy matching lib? The implementation is pretty straightforward and works well given that the above data is relatively similar. I've performed the below without preprocessing.
import pandas as pd
""" Install the libs below via terminal:
$ pip install fuzzywuzzy
$ pip install python-Levenshtein
"""
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# creating the data frames
text_a = ['AKIL KUMAR SINGH','OUSMANI DJIBO','PETER HRYB','CNOC LIMITED','POLY NOVA INDUSTRIES LTD','SAM GAWED JR','ADAN GENERAL LLC','CHINA MOBLE LIMITED','CASTAR CO., LTD.','MURAN','OLD SAROOP FOR CAR SEAT COVERS','CNP HEALTHCARE, LLC','GLORY PACK LTD','AUNCO VENTURES','INTERNATIONAL COMPANY','SAMEERA HEAT AND ENERGY FUND']
text_b = ['Singh, Akil Kumar','DJIBO, Ousmani Illiassou','HRYB, Peter','CNOOC LIMITED','POLYNOVA INDUSTRIES LTD.','GAWED, SAM','ADAN GENERAL TRADING FZE','CHINA MOBILE LIMITED','CASTAR GROUP CO., LTD.','MURMAN','Old Saroop for Car Seat Covers','CNP HEATHCARE, LLC','GLORY PACK LTD.','AUNCO VENTURE','INTL COMPANY','SAMEERA HEAT AND ENERGY PROPERTY FUND']
df_text_a = pd.DataFrame(text_a, columns=['text_a'])
df_text_b = pd.DataFrame(text_b, columns=['text_b'])

def lets_match(txt: str, chklist: list) -> str:
    return process.extractOne(txt, chklist, scorer=fuzz.token_set_ratio)

# match text_a against text_b
result_txt_ab = df_text_a.apply(lambda x: lets_match(str(x), text_b), axis=1, result_type='expand')
result_txt_ab.rename(columns={0: 'Return Match', 1: 'Match Value'}, inplace=True)
df_text_a[result_txt_ab.columns] = result_txt_ab
df_text_a
text_a Return Match Match Value
0 AKIL KUMAR SINGH Singh, Akil Kumar 100
1 OUSMANI DJIBO DJIBO, Ousmani Illiassou 72
2 PETER HRYB HRYB, Peter 100
3 CNOC LIMITED CNOOC LIMITED 70
4 POLY NOVA INDUSTRIES LTD POLYNOVA INDUSTRIES LTD. 76
5 SAM GAWED JR GAWED, SAM 100
6 ADAN GENERAL LLC ADAN GENERAL TRADING FZE 67
7 CHINA MOBLE LIMITED CHINA MOBILE LIMITED 79
8 CASTAR CO., LTD. CASTAR GROUP CO., LTD. 81
9 MURAN SAMEERA HEAT AND ENERGY PROPERTY FUND 41
10 OLD SAROOP FOR CAR SEAT COVERS Old Saroop for Car Seat Covers 100
11 CNP HEALTHCARE, LLC CNP HEATHCARE, LLC 58
12 GLORY PACK LTD GLORY PACK LTD. 100
13 AUNCO VENTURES AUNCO VENTURE 56
14 INTERNATIONAL COMPANY INTL COMPANY 74
15 SAMEERA HEAT AND ENERGY FUND SAMEERA HEAT AND ENERGY PROPERTY FUND 86
# match text_b against text_a
result_txt_ba = df_text_b.apply(lambda x: lets_match(str(x), text_a), axis=1, result_type='expand')
result_txt_ba.rename(columns={0: 'Return Match', 1: 'Match Value'}, inplace=True)
df_text_b[result_txt_ba.columns] = result_txt_ba
df_text_b
df_text_b
text_b Return Match Match Value
0 Singh, Akil Kumar AKIL KUMAR SINGH 100
1 DJIBO, Ousmani Illiassou OUSMANI DJIBO 100
2 HRYB, Peter PETER HRYB 100
3 CNOOC LIMITED CNOC LIMITED 74
4 POLYNOVA INDUSTRIES LTD. POLY NOVA INDUSTRIES LTD 74
5 GAWED, SAM SAM GAWED JR 86
6 ADAN GENERAL TRADING FZE ADAN GENERAL LLC 86
7 CHINA MOBILE LIMITED CHINA MOBLE LIMITED 81
8 CASTAR GROUP CO., LTD. CASTAR CO., LTD. 100
9 MURMAN ADAN GENERAL LLC 33
10 Old Saroop for Car Seat Covers OLD SAROOP FOR CAR SEAT COVERS 100
11 CNP HEATHCARE, LLC CNP HEALTHCARE, LLC 56
12 GLORY PACK LTD. GLORY PACK LTD 100
13 AUNCO VENTURE AUNCO VENTURES 53
14 INTL COMPANY INTERNATIONAL COMPANY 50
15 SAMEERA HEAT AND ENERGY PROPERTY FUND SAMEERA HEAT AND ENERGY FUND 100
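Side note: the fuzzywuzzy package has since been renamed to thefuzz (backed by rapidfuzz in recent versions), so on a current setup the install and imports become pip install thefuzz and from thefuzz import fuzz, process; the fuzz/process API used above stays the same.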
I think you can't do it without a notion of string distance; what you can do is use, for example, record linkage.
I won't get into details, but I'll show an example of its usage on this case.
import pandas as pd
import recordlinkage as rl
from recordlinkage.preprocessing import clean

# creating the first dataframe
df_text_a = pd.DataFrame({
    "Text A": [
        "AKIL KUMAR SINGH",
        "OUSMANI DJIBO",
        "PETER HRYB",
        "CNOC LIMITED",
        "POLY NOVA INDUSTRIES LTD",
        "SAM GAWED JR",
        "ADAN GENERAL LLC",
        "CHINA MOBLE LIMITED",
        "CASTAR CO., LTD.",
        "MURAN",
        "OLD SAROOP FOR CAR SEAT COVERS",
        "CNP HEALTHCARE, LLC",
        "GLORY PACK LTD",
        "AUNCO VENTURES",
        "INTERNATIONAL COMPANY",
        "SAMEERA HEAT AND ENERGY FUND"]
})

# creating the second dataframe
df_text_b = pd.DataFrame({
    "Text B": [
        "Singh, Akil Kumar",
        "DJIBO, Ousmani Illiassou",
        "HRYB, Peter",
        "CNOOC LIMITED",
        "POLYNOVA INDUSTRIES LTD. ",
        "GAWED, SAM",
        "ADAN GENERAL TRADING FZE",
        "CHINA MOBILE LIMITED",
        "CASTAR GROUP CO., LTD.",
        "MURMAN ",
        "Old Saroop for Car Seat Covers",
        "CNP HEATHCARE, LLC",
        "GLORY PACK LTD.",
        "AUNCO VENTURE",
        "INTL COMPANY",
        "SAMEERA HEAT AND ENERGY PROPERTY FUND"
    ]
})

# Preprocessing is very important for the results; you have to find what fits your problem well.
cleaned_a = pd.DataFrame(clean(df_text_a["Text A"], lowercase=True))
cleaned_b = pd.DataFrame(clean(df_text_b["Text B"], lowercase=True))

# Creating an index which will be used for comparison; there are various types of indexing, see the documentation.
indexer = rl.Index()
indexer.full()

# generating all possible pairs
pairs = indexer.index(cleaned_a, cleaned_b)

# starting the evaluation phase
compare = rl.Compare(n_jobs=-1)
compare.string("Text A", "Text B", method='jarowinkler', label='text')
matches = compare.compute(pairs, cleaned_a, cleaned_b)
matches is now a MultiIndex DataFrame; what you want to do next is find, for each value of the first index, the max over the second index. That will give you the results you need.
Results can be improved by working on the distance, the indexing, and/or the preprocessing.
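A minimal sketch of that last step, assuming matches carries the single score column 'text' created above:
# For each record of cleaned_a (first index level), keep the index of the
# best-scoring candidate from cleaned_b (second index level).
best_pairs = matches['text'].groupby(level=0).idxmax()
print(best_pairs)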

Scrape website to only show populated categories

I am in the process of scraping a website. It pulls the contents of the page, but there are categories that are technically empty, and the scrape still shows their headers. I would like to only see categories with events in them. Ideally I could even have the components of each transaction so I can choose which elements I want displayed.
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
print('Scraping NH Dept of Banking...')
print()
NHurl = 'https://www.nh.gov/banking/corporate-activities/index.htm'
NHr = requests.get(NHurl, headers=headers)
NHsoup = BeautifulSoup(NHr.text, 'html.parser')
NHlist = []
for events in NHsoup.findAll('tr')[2:]:
    print(events.text)
    NHlist.append(events.text)
print(' '.join(NHlist))
Like I said, this works to get all of the information, but there are a lot of headers/empty spaces that don't need to be pulled. For example, at the time I'm writing this, 'acquisitions', 'conversions', and 'change in control' are empty, but the headers still come in, and there's a relatively large blank space after the headers. I feel like I need some sort of loop to go through each header ('td') and then get its contents ('tr'), but I'm just not quite sure how to do it.
You can use itertools.groupby to group elements and then filter out empty rows:
import requests
from itertools import groupby
from bs4 import BeautifulSoup

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
print('Scraping NH Dept of Banking...')
print()
NHurl = 'https://www.nh.gov/banking/corporate-activities/index.htm'
NHr = requests.get(NHurl, headers=headers)
NHsoup = BeautifulSoup(NHr.text, 'html.parser')
NHlist = []
for _, g in groupby(NHsoup.select('tr'), lambda k, d={'g': 0}: (d.update(g=d['g'] + 1), d['g']) if k.select('th') else (None, d['g'])):
    s = [tag.get_text(strip=True, separator=' ') for tag in g]
    if any(i == '' for i in s):
        continue
    NHlist.append(s)

# This is just pretty printing, all the data are already in NHlist:
l = max(map(len, (j for i in NHlist for j in i))) + 5
for item in NHlist:
    print('{: <4} {}'.format(' ', item[0]))
    print('-' * l)
    for i, ev in enumerate(item[1:], 1):
        print('{: <4} {}'.format(i, ev))
    print()
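(The key function keeps a running counter in its mutable default argument d and increments it whenever a row contains a th, so each header row opens a new group and the data rows that follow fall into it.)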
Prints:
Scraping NH Dept of Banking...
New Bank
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 12/11/18 The Millyard Bank
Interstate Bank Combination
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 01/16/19 Optima Bank & Trust Company with and into Cambridge Trust Company Portsmouth, NH 03/29/19
Amendment to Articles of Agreement or Incorporation; Business or Capital Plan
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 11/26/18 John Hancock Trust Company Boston, MA 01/14/19
2 12/04/18 Franklin Savings Bank Franklin, NH 01/28/19
3 12/12/18 MFS Heritage Trust Company Boston, MA 01/28/19
4 02/25/19 Ankura Trust Company, LLC Fairfield, CT 03/22/19
5 4/25/19 Woodsville Guaranty Savings Bank Woodsville, NH 06/04/19
6 5/10/19 AB Trust Company New York, NY 06/04/19
Reduction in Capital
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 03/07/19 Primary Bank Bedford, NH 04/10/19
Amendment to Bylaws
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 12/10/18 Northeast Credit Union Porstmouth, NH 02/25/19
2 2/25/19 Members First Credit Union Manchester, NH 04/05/19
3 4/24/19 St. Mary's Bank Manchester, NH 05/30/19
4 6/28/19 Bellwether Community Credit Union
Interstate Branch Office
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 01/23/19 Newburyport Five Cents Savings Bank 141 Portsmouth Ave Exeter, NH 02/01/19
2 03/08/19 One Credit Union Newport, NH 03/29/19
3 03/01/19 JPMorgan Chase Bank, NA Nashua, NH 04/04/19
4 03/26/19 Mascoma Bank Lebanon, NH 04/09/19
5 04/24/19 Newburyport Five Cents Savings Bank 321 Lafayette Rd Hampton NH 05/08/19
6 07/10/19 Mascoma Bank 242-244 North Winooski Avenue Burlington VT 07/18/19
7 07/10/19 Mascoma Bank 431 Pine Street Burlington VT 07/18/19
Interstate Branch Office Closure
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 02/15/19 The Provident Bank 321 Lafayette Rd Hampton, NH 02/25/19
New Branch Office
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 12/07/18 Bank of New Hampshire 16-18 South Main Street Concord NH 01/02/19
2 3/4/19 Triangle Credit Union 360 Daniel Webster Highway, Merrimack, NH 03/11/19
3 04/03/19 Bellwether Community Credit Union 425-453 Commercial Street Manchester, NH 04/17/19
4 06/11/19 Primary Bank 23 Crystal Avenue Derry NH 06/11/19
Branch Office Closure
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 5/15/19 Northeast Credit Union Merrimack, NH 05/21/19
New Loan Production Office
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 04/08/19 Community National Bank 367 Route 120, Unit B-5 Lebanon, NH
03766-1430 04/15/19
Loan Production Office Closure
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 07/22/19 The Provident Bank 20 Trafalgar Square, Suite 447 Nashua NH 03063 07/31/19
Trade Name Requests
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 04/16/19 John Hancock Trust Company To use trade name "Manulife Investment Management Trust Company" 04/24/19
New Trust Company
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 02/19/19 Janney Trust Co., LLC
2 02/25/19 Darwin Trust Company of New Hampshire, LLC
3 07/15/`9 Harbor Trust Company
Dissolution of Trust Company
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 09/19/17 Cambridge Associates Fiduciary Trust, LLC Boston, MA 02/05/19
Trust Office Closure
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 5/10/19 Charter Trust Company Rochester, NH 05/20/19
New Trust Office
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 02/25/19 Ankura Trust Company, LLC 140 Sherman Street, 4th Floor Fairfield, CT 06824 03/22/19
Relocation of Trust Office
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 01/23/19 Geode Capital Management Trust Company, LLC Relocate from: One Post Office Square, 20th Floor, Boston MA To: 100 Summer Street, 12th Flr, Boston, MA 02/01/19
2 03/15/19 Drivetrain Trust Company LLC Relocate from: 630 3rd Avenue, 21st Flr New York, NY 10017 To: 410 Park Avenue, Suite 900 New York, NY 10022 03/29/19
3 04/14/19 Boston Partners Trust Company Relocate from: 909 Third Avenue New York, NY 10022 To: One Grand Central Place 60 East 42nd Street, Ste 1550 New York, NY 10165 04/23/19
You could test which rows contain all '\xa0' (appear blank) and exclude them. I append to a list and convert to a pandas dataframe, but you could just print the rows directly.
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

r = requests.get('https://www.nh.gov/banking/corporate-activities/index.htm')
soup = bs(r.content, 'lxml')
results = []
for tr in soup.select('tr'):
    row = [i.text for i in tr.select('th,td')]
    if row.count('\xa0') != len(row):
        results.append(row)
pd.set_option('display.width', 100)
df = pd.DataFrame(results)
df.style.set_properties(**{'text-align': 'left'})
df.columns = df.iloc[0]
df = df[1:]
df.fillna(value='', inplace=True)
print(df.head(20))
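(The '\xa0' values are non-breaking spaces, the text form of &nbsp; in the page's HTML, which is why the visually blank cells are not empty strings.)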
Not sure if this is how you want it, and there is probably a more elegant way, but what I basically did was:
use Pandas to get the table
Pandas automatically assigns columns, so move the column headers into the first row
find where rows are all nulls
drop the rows with all nulls and the previous row (its sub-header)
import pandas as pd
print('Scraping NH Dept of Banking...')
print()
NHurl = 'https://www.nh.gov/banking/corporate-activities/index.htm'
df = pd.read_html(NHurl)[0]
top_row = pd.DataFrame([df.columns], index=[-1])
df.columns = top_row.columns
df = df.append(top_row, sort=True).sort_index().reset_index(drop=True)
null_rows = df[df.isnull().values.all(axis=1)].index.tolist()
drop_hdr_rows = [x - 1 for x in null_rows ]
drop_rows = drop_hdr_rows + null_rows
new_df = df[~df.index.isin(drop_rows)]
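Note that DataFrame.append was removed in pandas 2.0; a version-proof equivalent of that append line would be:
df = pd.concat([df, top_row], sort=True).sort_index().reset_index(drop=True)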
Output:
print (new_df.to_string())
0 1 2 3
2 New Bank New Bank New Bank New Bank
3 12/11/18 The Millyard Bank NaN NaN
4 Interstate Bank Combination Interstate Bank Combination Interstate Bank Combination Interstate Bank Combination
5 01/16/19 Optima Bank & Trust Company with and into Camb... Portsmouth, NH 03/29/19
12 Amendment to Articles of Agreement or Incorpor... Amendment to Articles of Agreement or Incorpor... Amendment to Articles of Agreement or Incorpor... Amendment to Articles of Agreement or Incorpor...
13 11/26/18 John Hancock Trust Company Boston, MA 01/14/19
14 12/04/18 Franklin Savings Bank Franklin, NH 01/28/19
15 12/12/18 MFS Heritage Trust Company Boston, MA 01/28/19
16 02/25/19 Ankura Trust Company, LLC Fairfield, CT 03/22/19
17 4/25/19 Woodsville Guaranty Savings Bank Woodsville, NH 06/04/19
18 5/10/19 AB Trust Company New York, NY 06/04/19
19 Reduction in Capital Reduction in Capital Reduction in Capital Reduction in Capital
20 03/07/19 Primary Bank Bedford, NH 04/10/19
21 Amendment to Bylaws Amendment to Bylaws Amendment to Bylaws Amendment to Bylaws
22 12/10/18 Northeast Credit Union Porstmouth, NH 02/25/19
23 2/25/19 Members First Credit Union Manchester, NH 04/05/19
24 4/24/19 St. Mary's Bank Manchester, NH 05/30/19
25 6/28/19 Bellwether Community Credit Union NaN NaN
26 Interstate Branch Office Interstate Branch Office Interstate Branch Office Interstate Branch Office
27 01/23/19 Newburyport Five Cents Savings Bank 141 Portsmouth Ave Exeter, NH 02/01/19
28 03/08/19 One Credit Union Newport, NH 03/29/19
29 03/01/19 JPMorgan Chase Bank, NA Nashua, NH 04/04/19
30 03/26/19 Mascoma Bank Lebanon, NH 04/09/19
31 04/24/19 Newburyport Five Cents Savings Bank 321 Lafayette Rd Hampton NH 05/08/19
32 07/10/19 Mascoma Bank 242-244 North Winooski Avenue Burlington VT 07/18/19
33 07/10/19 Mascoma Bank 431 Pine Street Burlington VT 07/18/19
34 Interstate Branch Office Closure Interstate Branch Office Closure Interstate Branch Office Closure Interstate Branch Office Closure
35 02/15/19 The Provident Bank 321 Lafayette Rd Hampton, NH 02/25/19
36 New Branch Office New Branch Office New Branch Office New Branch Office
37 12/07/18 Bank of New Hampshire 16-18 South Main Street Concord NH 01/02/19
38 3/4/19 Triangle Credit Union 360 Daniel Webster Highway, Merrimack, NH 03/11/19
39 04/03/19 Bellwether Community Credit Union 425-453 Commercial Street Manchester, NH 04/17/19
40 06/11/19 Primary Bank 23 Crystal Avenue Derry NH 06/11/19
41 Branch Office Closure Branch Office Closure Branch Office Closure Branch Office Closure
42 5/15/19 Northeast Credit Union Merrimack, NH 05/21/19
43 New Loan Production Office New Loan Production Office New Loan Production Office New Loan Production Office
44 04/08/19 Community National Bank 367 Route 120, Unit B-5 Lebanon, NH 03766-1430 04/15/19
45 Loan Production Office Closure Loan Production Office Closure Loan Production Office Closure Loan Production Office Closure
46 07/22/19 The Provident Bank 20 Trafalgar Square, Suite 447 Nashua NH 03063 07/31/19
51 Trade Name Requests Trade Name Requests Trade Name Requests Trade Name Requests
52 04/16/19 John Hancock Trust Company To use trade name "Manulife Investment Managem... 04/24/19
53 New Trust Company New Trust Company New Trust Company New Trust Company
54 02/19/19 Janney Trust Co., LLC NaN NaN
55 02/25/19 Darwin Trust Company of New Hampshire, LLC NaN NaN
56 07/15/`9 Harbor Trust Company NaN NaN
57 Dissolution of Trust Company Dissolution of Trust Company Dissolution of Trust Company Dissolution of Trust Company
58 09/19/17 Cambridge Associates Fiduciary Trust, LLC Boston, MA 02/05/19
59 Trust Office Closure Trust Office Closure Trust Office Closure Trust Office Closure
60 5/10/19 Charter Trust Company Rochester, NH 05/20/19
61 New Trust Office New Trust Office New Trust Office New Trust Office
62 02/25/19 Ankura Trust Company, LLC 140 Sherman Street, 4th Floor Fairfield, CT 0... 03/22/19
63 Relocation of Trust Office Relocation of Trust Office Relocation of Trust Office Relocation of Trust Office
64 01/23/19 Geode Capital Management Trust Company, LLC Relocate from: One Post Office Square, 20th Fl... 02/01/19
65 03/15/19 Drivetrain Trust Company LLC Relocate from: 630 3rd Avenue, 21st Flr New Y... 03/29/19
66 04/14/19 Boston Partners Trust Company Relocate from: 909 Third Avenue New York, NY ... 04/23/19

How to Nest If Statement Within For Loop When Scraping Div Class HTML

Below is a scraper that uses Beautiful Soup to scrape physician information off of this webpage. As you can see from the html code directly below, each physician has an individual profile on the webpage that displays the physician's name, clinic, profession, taxonomy, and city.
<div class="views-field views-field-title practitioner__name" >Marilyn Adams</div>
<div class="views-field views-field-field-pract-clinic practitioner__clinic" >Fortius Sport & Health</div>
<div class="views-field views-field-field-pract-profession practitioner__profession" >Physiotherapist</div>
<div class="views-field views-field-taxonomy-vocabulary-5 practitioner__region" >Fraser River Delta</div>
<div class="views-field views-field-city practitioner__city" ></div>
As you can see from the sample html code, the physician profiles occasionally have information missing. If this occurs, I would like the scraper to print 'N/A'. I need the scraper to print 'N/A' because I would eventually like to put each div class category (name, clinic, profession, etc.) into an array where the lengths of each column are exactly the same so I can properly export the data to a CSV file. Here is an example of what I want the output to look like compared to what is actually showing up.
Actual Expected
[Names] [Names]
Greg Greg
Bob Bob
[Clinic] [Clinic]
Sport/Health Sport/Health
N/A
[Profession] [Profession]
Physical Therapist Physical Therapist
Physical Therapist Physical Therapist
[Taxonomy] [Taxonomy]
Fraser River Fraser River
N/A
[City] [City]
Vancouver Vancouver
Vancouver Vancouver
I have tried writing an if statement nested within each for loop, but the code does not seem to be looping correctly, as the "N/A" only shows up once for each div class section. Does anyone know how to properly nest an if statement within a for loop so I get the proper number of "N/A"s in each column? Thanks in advance!
import requests
import re
from bs4 import BeautifulSoup

page = requests.get('https://sportmedbc.com/practitioners')
soup = BeautifulSoup(page.text, 'html.parser')

# Find Doctor Info
for doctor in soup.find_all('div', attrs={'class': 'views-field views-field-title practitioner__name'}):
    for a in doctor.find_all('a'):
        print(a.text)
for clinic_name in soup.find_all('div', attrs={'class': 'views-field views-field-field-pract-clinic practitioner__clinic'}):
    for b in clinic_name.find_all('a'):
        if b == (''):
            print('N/A')
profession_links = soup.findAll('div', attrs={'class': 'views-field views-field-field-pract-profession practitioner__profession'})
for profession in profession_links:
    if profession.text == (''):
        print('N/A')
    print(profession.text)
taxonomy_links = soup.findAll('div', attrs={'class': 'views-field views-field-taxonomy-vocabulary-5 practitioner__region'})
for taxonomy in taxonomy_links:
    if taxonomy.text == (''):
        print('N/A')
    print(taxonomy.text)
city_links = soup.findAll('div', attrs={'class': 'views-field views-field-taxonomy-vocabulary-5 practitioner__region'})
for city in city_links:
    if city.text == (''):
        print('N/A')
    print(city.text)
For this problem you can use ChainMap from the collections module. That way you can define your default values, in this case 'n/a', and only grab information that exists for each doctor:
from bs4 import BeautifulSoup
import requests
from collections import ChainMap

url = 'https://sportmedbc.com/practitioners'
soup = BeautifulSoup(requests.get(url).text, 'lxml')

def get_data(soup):
    default_data = {'name': 'n/a', 'clinic': 'n/a', 'profession': 'n/a', 'region': 'n/a', 'city': 'n/a'}
    for doctor in soup.select('.view-practitioners .practitioner'):
        doctor_data = {}
        if doctor.select_one('.practitioner__name').text.strip():
            doctor_data['name'] = doctor.select_one('.practitioner__name').text
        if doctor.select_one('.practitioner__clinic').text.strip():
            doctor_data['clinic'] = doctor.select_one('.practitioner__clinic').text
        if doctor.select_one('.practitioner__profession').text.strip():
            doctor_data['profession'] = doctor.select_one('.practitioner__profession').text
        if doctor.select_one('.practitioner__region').text.strip():
            doctor_data['region'] = doctor.select_one('.practitioner__region').text
        if doctor.select_one('.practitioner__city').text.strip():
            doctor_data['city'] = doctor.select_one('.practitioner__city').text
        yield ChainMap(doctor_data, default_data)

for doctor in get_data(soup):
    print('name:\t\t', doctor['name'])
    print('clinic:\t\t', doctor['clinic'])
    print('profession:\t', doctor['profession'])
    print('city:\t\t', doctor['city'])
    print('region:\t\t', doctor['region'])
    print('-' * 80)
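ChainMap looks its mappings up left to right, so a field present in doctor_data wins and anything missing falls back to the 'n/a' defaults in default_data.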
Prints:
name: Jaimie Ackerman
clinic: n/a
profession: n/a
city: n/a
region: n/a
--------------------------------------------------------------------------------
name: Marilyn Adams
clinic: Fortius Sport & Health
profession: Physiotherapist
city: n/a
region: Fraser River Delta
--------------------------------------------------------------------------------
name: Mahsa Ahmadi
clinic: Wellpoint Acupuncture (Sports Medicine)
profession: Acupuncturist
city: Vancouver
region: Vancouver & Sea to Sky
--------------------------------------------------------------------------------
name: Tracie Albisser
clinic: Pacific Sport Northern BC, Tracie Albisser
profession: Strength and Conditioning Specialist, Exercise Physiologist
city: n/a
region: Cariboo - North East
--------------------------------------------------------------------------------
name: Christine Alder
clinic: n/a
profession: n/a
city: Vancouver
region: Vancouver & Sea to Sky
--------------------------------------------------------------------------------
name: Steacy Alexander
clinic: Go! Physiotherapy Sports and Wellness Centre
profession: Physiotherapist
city: Vancouver
region: Vancouver & Sea to Sky
--------------------------------------------------------------------------------
name: Page Allison
clinic: AET Clinic, .
profession: Athletic Therapist
city: Victoria
region: Vancouver Island - Central Coast
--------------------------------------------------------------------------------
name: Dana Alumbaugh
clinic: n/a
profession: Podiatrist
city: Squamish
region: Vancouver & Sea to Sky
--------------------------------------------------------------------------------
name: Manouch Amel
clinic: Mountainview Kinesiology Ltd.
profession: Strength and Conditioning Specialist
city: Anmore
region: Vancouver & Sea to Sky
--------------------------------------------------------------------------------
name: Janet Ames
clinic: Dr. Janet Ames
profession: Physician
city: Prince George
region: Cariboo - North East
--------------------------------------------------------------------------------
name: Sandi Anderson
clinic: n/a
profession: n/a
city: Coquitlam
region: Fraser Valley
--------------------------------------------------------------------------------
name: Greg Anderson
clinic: University of the Fraser Valley
profession: Exercise Physiologist
city: Mission
region: Fraser Valley
--------------------------------------------------------------------------------
EDIT:
For getting the output in columns, you can use this example:
def print_data(header_text, data, key):
    print(header_text)
    for d in data:
        print(d[key])
    print()

data = list(get_data(soup))

print_data('[Names]', data, 'name')
print_data('[Clinic]', data, 'clinic')
print_data('[Profession]', data, 'profession')
print_data('[Taxonomy]', data, 'region')
print_data('[City]', data, 'city')
This prints:
[Names]
Jaimie Ackerman
Marilyn Adams
Mahsa Ahmadi
Tracie Albisser
Christine Alder
Steacy Alexander
Page Allison
Dana Alumbaugh
Manouch Amel
Janet Ames
Sandi Anderson
Greg Anderson
[Clinic]
n/a
Fortius Sport & Health
Wellpoint Acupuncture (Sports Medicine)
Pacific Sport Northern BC, Tracie Albisser
n/a
Go! Physiotherapy Sports and Wellness Centre
AET Clinic, .
n/a
Mountainview Kinesiology Ltd.
Dr. Janet Ames
n/a
University of the Fraser Valley
[Profession]
n/a
Physiotherapist
Acupuncturist
Strength and Conditioning Specialist, Exercise Physiologist
n/a
Physiotherapist
Athletic Therapist
Podiatrist
Strength and Conditioning Specialist
Physician
n/a
Exercise Physiologist
[Taxonomy]
n/a
Fraser River Delta
Vancouver & Sea to Sky
Cariboo - North East
Vancouver & Sea to Sky
Vancouver & Sea to Sky
Vancouver Island - Central Coast
Vancouver & Sea to Sky
Vancouver & Sea to Sky
Cariboo - North East
Fraser Valley
Fraser Valley
[City]
n/a
n/a
Vancouver
n/a
Vancouver
Vancouver
Victoria
Squamish
Anmore
Prince George
Coquitlam
Mission

Having trouble merging two dataframes in python

I am new to Python and I am trying to merge two datasets for my research together:
df1 has the column names: companyname, ticker, and Dscode,
df2 has companyname, ticker, grouptcode, and Dscode.
I want to merge the grouptcode from df2 into df1; however, the companyname is slightly different, though very similar, between the two dataframes.
For each ticker, there is an associated Dscode. However, multiple companies have the same ticker, and therefore the same Dscode.
Problem
I am only interested in merging the grouptcode for the associated ticker and Dscode that matches the companyname (which at times is slightly different - this part is what I cannot get past). The code I have been using is below.
Code
import pandas as pd
import os
# set working directory
path = "/Users/name/Desktop/Python"
os.chdir(path)
os.getcwd() # Prints the working directory
# read in excel file
file = "/Users/name/Desktop/Python/Excel/DSROE.xlsx"
x1 = pd.ExcelFile(file)
print(x1.sheet_names)
df1 = x1.parse('Sheet1')
df1.head()
df1.tail()
file2 = "/Users/name/Desktop/Python/Excel/tcode2.xlsx"
x2 = pd.ExcelFile(file2)
print(x2.sheet_names)
df2 = x2.parse('Sheet1')
df2['companyname'] = df2['companyname'].str.upper() ## make column uppercase
df2.head()
df2.tail()
df2 = df2.dropna()
x3 = pd.merge(df1, df2, how='outer')  # merge
Data
df1
Dscode ticker companyname
65286 8933TC 3pl 3P LEARNING LIMITED
79291 9401FP a2m A2 MILK COMPANY LIMITED
1925 14424Q aac AUSTRALIAN AGRICULTURAL COMPANY LIMITED
39902 675493 aad ARDENT LEISURE GROUP
1400 133915 aba AUSWIDE BANK LIMITED
74565 922472 abc ADELAIDE BRIGHTON LIMITED
7350 26502C abp ABACUS PROPERTY GROUP
39202 675142 ada ADACEL TECHNOLOGIES LIMITED
80866 9661AD adh ADAIRS
80341 9522QV afg AUSTRALIAN FINANCE GROUP LIMITED
45327 691938 agg ANGLOGOLD ASHANTI LIMITED
2625 14880E agi AINSWORTH GAME TECHNOLOGY LIMITED
75090 923040 agl AGL ENERGY LIMITED
19251 29897X ago ATLAS IRON LIMITED
64409 890588 agy ARGOSY MINERALS LIMITED
24151 31511D ahg AUTOMOTIVE HOLDINGS GROUP LIMITED
64934 8917JD ahy ASALEO CARE LIMITED
42877 691152 aia AUCKLAND INTERNATIONAL AIRPORT LIMITED
61433 88013C ajd ASIA PACIFIC DATA CENTRE GROUP
44452 691704 ajl AJ LUCAS GROUP LIMITED
700 13288C ajm ALTURA MINING LIMITED
19601 29929D akp AUDIO PIXELS HOLDINGS LIMITED
79816 951404 alk ALKANE RESOURCES LIMITED
56008 865613 all ARISTOCRAT LEISURE LIMITED
51807 771351 alq ALS LIMITED
44277 691685 alu ALTIUM LIMITED
42702 68625C alx ATLAS ARTERIA GROUP
30101 41162F ama AMA GROUP LIMITED
67386 902201 amc AMCOR LIMITED
33426 50431L ami AURELIA METALS LIMITED
df2
companyname grouptcode ticker
524 3P LEARNING LIMITED.. tpn1 3pl
1 THE A2 MILK COMPANY LIMITED a2m1 a2m
2 AUSTRALIAN AGRICULTURAL COMPANY LIMITED. aac2 aac
3 AAPC LIMITED. aad1 aad
6 ADVANCE BANK AUSTRALIA LIMITED aba1 aba
7 ADELAIDE BRIGHTON CEMENT HOLDINGS LIMITED abc1 abc
8 ABACUS PROPERTY GROUP abp1 abp
9 ADACEL TECHNOLOGIES LIMITED ada1 ada
288 ADA CORPORATION LIMITED khs1 ada
10 AERODATA HOLDINGS LIMITED adh1 adh
11 ADAMS (HERBERT) HOLDINGS LIMITED adh2 adh
12 ADAIRS LIMITED adh3 adh
431 ALLCO FINANCE GROUP LIMITED rcd1 afg
13 AUSTRALIAN FINANCE GROUP LTD afg1 afg
14 ANGLOGOLD ASHANTI LIMITED agg1 agg
15 APGAR INDUSTRIES LIMITED agi1 agi
16 AINSWORTH GAME TECHNOLOGY LIMITED agi2 agi
17 AUSTRALIAN GAS LIGHT COMPANY (THE) agl1 agl
18 ATLAS IRON LIMITED ago1 ago
393 ACM GOLD LIMITED pgo2 ago
19 AUSTRALIAN GYPSUM INDUSTRIES LIMITED agy1 agy
142 ARGOSY MINERALS INC cio1 agy
21 ARCHAEAN GOLD NL ahg1 ahg
22 AUSTRALIAN HYDROCARBONS N.L. ahy1 ahy
23 ASALEO CARE LIMITED ahy2 ahy
24 AUCKLAND INTERNATIONAL AIRPORT LIMITED aia1 aia
25 ASIA PACIFIC DATA CENTRE GROUP ajd1 ajd
26 AJ LUCAS GROUP LIMITED ajl1 ajl
27 AJAX MCPHERSON'S LIMITED ajm1 ajm
29 ALKANE EXPLORATION (TERRIGAL) N.L. alk1 alk
Dscode
524 8933TC
1 9401FP
2 14424Q
3 675493
6 133915
7 922472
8 26502C
9 675142
288 675142
10 9661AD
11 9661AD
12 9661AD
431 9522QV
13 9522QV
14 691938
15 14880E
16 14880E
17 923040
18 29897X
393 29897X
19 890588
142 890588
21 31511D
22 8917JD
23 8917JD
24 691152
25 88013C
26 691704
27 13288C
29 951404
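One possible direction for the merge, sketched and untested, assuming the column names exactly as printed above: restrict the df2 candidates to rows sharing the ticker (and hence the Dscode), pick the companyname closest to the df1 name with difflib, and copy that row's grouptcode across. This assumes both name columns have already been normalized to the same case, as the str.upper() call above does for df2.
import difflib
import pandas as pd

def closest_name(name, candidates):
    # Best fuzzy match for `name` among `candidates`, or None below the cutoff.
    hits = difflib.get_close_matches(name, candidates, n=1, cutoff=0.6)
    return hits[0] if hits else None

def attach_grouptcode(df1, df2):
    tcodes = []
    for _, row in df1.iterrows():
        # Only consider df2 rows with the same ticker.
        pool = df2[df2['ticker'] == row['ticker']]
        match = closest_name(row['companyname'], pool['companyname'].tolist())
        if match is None:
            tcodes.append(None)
        else:
            tcodes.append(pool.loc[pool['companyname'] == match, 'grouptcode'].iloc[0])
    out = df1.copy()
    out['grouptcode'] = tcodes
    return out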
