How to append data in data frame using beautiful soup - python

import requests
from bs4 import BeautifulSoup
import pandas as pd
# Gather the text of every <p> inside the location cards reachable from the
# /al index page and show it as an unlabeled DataFrame.
baseurl = 'https://locations.atipt.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
}

# Step 1: prefix baseurl to the href of every anchor found inside a
# 'list-unstyled' <ul> on the index page.
r = requests.get('https://locations.atipt.com/al')
soup = BeautifulSoup(r.content, 'html.parser')
tra = soup.find_all('ul', class_='list-unstyled')
productlinks = [
    baseurl + anchor['href']
    for group in tra
    for anchor in group.find_all('a', href=True)
]

# Step 2: visit each collected link; every <p> in a 'listing content-card'
# div contributes one row made of its stripped text fragments.
temp = []
for link in productlinks:
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    for card in soup.find_all('div', class_='listing content-card'):
        temp.extend(list(p.stripped_strings) for p in card.find_all('p'))

df = pd.DataFrame(temp)
print(df)
This is the output I get
9256 Parkway E Ste A
Birmingham,
Alabama 35206
But I don't know how to name the columns in the data frame. I want to label "9256 Parkway E Ste A" as Address, "Birmingham" as City, and "Alabama 35206" as State. If this is possible, kindly help me with this matter.

import pandas as pd

# Build one labeled row per location card. Relies on `productlinks` and
# `headers` already being defined by the question's script.
temp = []
for link in productlinks:
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    tag = soup.find_all('div', class_='listing content-card')
    for pro in tag:
        # Assumes each card holds at least three <p> elements.
        data = [tup.text for tup in pro.find_all('p')]
        # The first two paragraphs are concatenated into the address.
        address = "".join(data[:2])
        # The third paragraph is split at the comma: the part before it is
        # used as the city, the part after it carries state and postal code.
        splitdata = data[2].split(",")
        city = splitdata[0]
        # State and postal code are separated by a non-breaking space.
        splitsecond = splitdata[-1].split("\xa0")
        state = splitsecond[0]
        postalcode = splitsecond[-1]
        # Include the postal code so the row matches the four columns below.
        temp.append([address, city, state, postalcode])

df = pd.DataFrame(temp, columns=["Address", "City", "State", "Postalcode"])
# Bare expression: displays the frame in a notebook/REPL; use print(df) in a script.
df
Output:
Address City State Postalcode
0 634 1st Street NSte 100 Alabaster AL 35007
1 9256 Parkway ESte A Birmingham AL 35206
....
If you want to add call details just add this statement after postalcode
# Take the stripped text of the first <span class="directory-phone"> inside
# `pro`, keep only what follows its last newline, and drop leading whitespace.
callNumber=pro.find("span",class_="directory-phone").get_text(strip=True).split("\n")[-1].lstrip()
and append this to temp list

Related

Zillow returns first 7 properties

Trying to make a long story short so I apologize in advance, feel free to ask more questions for clarity.
Essentially I am trying to make a web scraping script that takes info from Zillow and puts it into a pandas data frame so that I can learn both pandas and beautifulsoup4 in the process. I am trying to avoid using the Zillow API but it seems it might be my only option. So, when I scrape the location the user inputs, it only returns 7 properties. I was told this is because of the Javascript Zillow uses ("Lazy-loading" or "infinite scrolling".) Basically the other properties aren't loaded until the user scrolls. I tried using selenium instead of requests but I end up getting bot verification captcha'd. I tried using headers and everything but cant seem to figure out a solution other than the API.
Here's my code BEFORE using selenium (aka when it semi-worked):
from bs4 import BeautifulSoup
import pandas as pd
from uszipcode import SearchEngine
import requests, prettify
# Look up the city for a user-supplied zipcode, fetch the Zillow for-sale
# page for that city, and tabulate address, price and link of each listing
# present in the returned HTML.
search = SearchEngine()
zipcode = input("What is your zipcode: ")
zipcode_info = search.by_zipcode(zipcode)
headers = {
    'accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    # NOTE(review): this value looks like a language list rather than an
    # encoding list -- confirm which header was intended.
    'accept-encoding' : 'en-US,en;0.8',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}


def _text_or_none(node):
    """Return ``node.text``, or ``None`` when the lookup found no element."""
    return None if node is None else node.text


with requests.Session() as session:
    url = "https://www.zillow.com/homes/for_sale/" + zipcode_info.major_city + "/"
    response = session.get(url, headers=headers)

soup = BeautifulSoup(response.content, 'html.parser')
properties = soup.find_all("li", attrs={"class": "ListItem-c11n-8-73-8__sc-10e22w8-0 srp__hpnp3q-0 enEXBq with_constellation"})

# One dict per listing keeps the columns aligned: a field that is missing
# from a card becomes None instead of being skipped, which previously let
# the per-column lists end up with different lengths.
rows = []
bed_bath = list()
for li in properties:
    link_tag = li.find("a", attrs = {"data-test": "property-card-link"})
    rows.append({
        'Address': _text_or_none(li.find("a", attrs = {"class": "StyledPropertyCardDataArea-c11n-8-73-8__sc-yipmu-0 lhIXlm property-card-link"})),
        'Price': _text_or_none(li.find("span", attrs = {"data-test": "property-card-price"})),
        'Links': None if link_tag is None else link_tag.get("href"),
    })

    # Bed/bath figures are collected separately and are not part of the frame.
    span = li.find("span", attrs = {"class": "StyledPropertyCardHomeDetails-c11n-8-73-8__sc-1mlc4v9-0 jlVIIO"})
    if span is not None:
        try:
            for subspan in span:
                bed_bath.append(subspan.find("b").text)
        except AttributeError:
            # A child without a <b> (or a bare text node) ends collection
            # for this card, as before; only this error type is tolerated.
            pass

df = pd.DataFrame(rows, columns=['Address', 'Price', 'Links'])
print (df)
And the output is:
Address Price Links
0 525 W River Dr, Pennsauken, NJ 08110 $259,900 https://www.zillow.com/homedetails/525-W-River...
1 7519 Remington, Merchantville, NJ 08109 $270,000 https://www.zillow.com/homedetails/7519-Reming...
2 2269 Marlon Ave, Pennsauken, NJ 08110 $220,000 https://www.zillow.com/homedetails/2269-Marlon...
3 8129 River Rd, Pennsauken, NJ 08110 $324,999 https://www.zillow.com/homedetails/8129-River-...
4 1653 Springfield Ave, Pennsauken, NJ 08110 $259,900 https://www.zillow.com/homedetails/1653-Spring...
5 5531 Jackson Ave, Pennsauken, NJ 08110 $265,000 https://www.zillow.com/homedetails/5531-Jackso...
6 8141 Stow Rd, Pennsauken, NJ 08110 $359,000 https://www.zillow.com/homedetails/8141-Stow-R...
7 2203 42nd St, Pennsauken, NJ 08110 $275,000 https://www.zillow.com/homedetails/2203-42nd-S...

Scraping returning None

I am trying to scrape Yellow Pages. Everything is working fine except scraping the phone numbers! Each one is in a div with class = 'popover-phones', but it contains an a tag whose href is the phone number. Can anyone assist me, please? yellow pages inspection
import item as item
import requests
from bs4 import BeautifulSoup
import json
from csv import writer
# Print company name, address and the phone container of every listing on
# the charcoal category page.
url = 'https://yellowpages.com.eg/en/category/charcoal'
# The User-Agent is one string; adjacent literals keep it on two source lines
# without embedding a line break in the header value.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
articles = soup.find_all('div', class_= 'col-xs-12 item-details')
for item in articles:
    address = item.find('a',class_= 'address-text').text
    company = item.find('a',class_= 'item-title').text
    # This is the lookup the question is about: it yields the
    # 'popover-phones' div element (or None), not the phone number text.
    telephone = item.find('div', class_='popover-phones')
    print(company,address,telephone)
The phone numbers you see are loaded from an external URL. To get all phone numbers from the page, you can use the following example:
import requests
from bs4 import BeautifulSoup
# For every element carrying a data-tooltip-phones attribute, fetch the JSON
# that attribute points to and print the listing title with its numbers.
url = "https://yellowpages.com.eg/en/category/charcoal"
listing_page = requests.get(url)
soup = BeautifulSoup(listing_page.content, "html.parser")

for marker in soup.select("[data-tooltip-phones]"):
    # The title is the closest preceding element with class "item-title".
    title = marker.find_previous(class_="item-title").text
    phone_url = "https://yellowpages.com.eg" + marker["data-tooltip-phones"]
    phones = requests.get(phone_url).json()
    # The JSON is iterated as a sequence of sequences; flatten it one level.
    numbers = [number for group in phones for number in group]
    print(title, *numbers)
Prints:
2 Bacco 02-3390-8764
3 A Group International 0120-3530-005 057-2428-449
3 A Group International 0120-3833-500 0120-3530-005
Abdel Karim 0122-3507-461
Abdel Sabour Zidan 03-4864-641
Abou Aoday 0111-9226-536 0100-3958-351
Abou Eid For Charcoal Trading 0110-0494-770
Abou Fares For Charcoal Trade 0128-3380-916
Abou Karim Store 0100-6406-939
Adel Sons 0112-1034-398 0115-0980-776
Afandina 0121-2414-087
Ahmed El Fahham 02-2656-0815
Al Baraka For Charcoal 0114-6157-799 0109-3325-720
Al Ghader For Import & Export 03-5919-355 0111-0162-602 0120-6868-434
Al Mashd For Coal 0101-0013-743 0101-0013-743
Al Zahraa Co. For Exporting Charcoal & Agriculture Products 040-3271-056 0100-0005-174 040-3271-056
Alex Carbon Group 03-3935-902
Alwaha Charcoal Trade Est. 0100-4472-554 0110-1010-810 0100-9210-812
Aly Abdel Rahman For Charcoal Trade 03-4804-440 0122-8220-661
Amy Deluxe Egypt 0112-5444-410

Scrape data using beautifulsoup

I am extracting the data, but the same name and surname are repeated in each entry, even though the name and surname are different for each entry. This is the page link: https://www.aeafa.es/asociados.php
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Walk result pages 1-4 of the members directory and collect name, surname
# and contact fields into a DataFrame.
# NOTE(review): the original indentation was lost; the nesting below is the
# most plausible reading -- confirm against the original post.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
}
temp = []
# A single dict is created once here; the loops below keep overwriting its
# keys and append this same object each time.
wev = {}
for page in range(1, 5):
    r = requests.get(
        f"https://www.aeafa.es/asociados.php?provinput=&_pagi_pg={page}",
        headers=headers,
    )
    soup = BeautifulSoup(r.content, "lxml")

    # Name and surname are read from the first two <td> cells of each <tbody>.
    details = soup.find('table', class_="table")
    for detail in details.find_all('tbody'):
        cells = [up.text for up in detail.find_all("td")]
        wev['Nombre'] = cells[0]
        wev["Apellidos"] = cells[1]

    # Contact fields come from the <p> elements of each detail div; the fixed
    # slices drop a leading label of that many characters from each paragraph.
    for pro in soup.find_all("div", class_="col-md-8 col-sm-8"):
        data = [tup.text for tup in pro.find_all("p")]
        wev[" Dirección"] = data[2][12:]
        wev[" Población"] = data[3][14:]
        wev["Provincia "] = data[4][14:]
        phone = "+" + data[5][11:].replace(".", "")
        wev[" Teléfono"] = phone.replace("-", '')
        wev["Email"] = data[6][10:]
        temp.append(wev)

df = pd.DataFrame(temp)
print(df)
It prints the same name and surname in each entry. How do I correct it? This is the output:
Nombre Apellidos
0 JUAN ARIAS BARTOLOMÉ
1 JUAN ARIAS BARTOLOM
One approach would be to merge the separate name and surname details into the data from the about information. A test could also be added for when the last page is reached:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from unicodedata import normalize
import re
# Scrape every page of the members directory. The per-member detail blocks
# and the name/surname table are collected separately, then merged row by row.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
}

# Keys parsed from the page go through NFKD normalisation, so the phone key
# is normalised the same way; a precomposed 'é' would otherwise never match.
phone_key = normalize('NFKD', 'Teléfono')


def _clean(text):
    """NFKD-normalise *text* and collapse each whitespace run to one space."""
    return re.sub(r'\s+', ' ', normalize('NFKD', text))


page = 1
data1 = []
data2 = []
while True:
    print(f"Page {page}")
    r = requests.get(f"https://www.aeafa.es/asociados.php?provinput=&_pagi_pg={page}", headers=headers)
    page += 1
    soup = BeautifulSoup(r.content, "lxml")

    for pro in soup.find_all("div", class_="col-md-8 col-sm-8"):
        values = [_clean(p.get_text(strip=True)) for p in pro.find_all("p")]
        row = {'Sobre' : values[0][6:]}  # skip over the word Sobre
        for item in values[2:]:
            # Skip paragraphs that are not "label: value" pairs rather than
            # failing on the unpacking below.
            if ':' not in item:
                continue
            key, value = item.split(':', 1)
            row[key.strip()] = value.strip()
        # Not every entry is guaranteed to carry a phone line.
        if phone_key in row:
            row[phone_key] = row[phone_key].replace(".", "")
        data1.append(row)

    details = soup.find("table", class_="table").tbody
    for tr in details.find_all("tr"):
        # The last cell of each table row is left out.
        data2.append([_clean(td.get_text(strip=True)) for td in tr.find_all("td")[:-1]])

    # Any more? Continue only while the final pagination item is "»".
    ul = soup.find("ul", class_="pagination")
    items = ul.find_all("li") if ul is not None else []
    if not items or items[-1].text != "»":
        break

# Merge the name and surname from the second table (dict | dict needs Python 3.9+).
data = [
    {'Nombre' : d2[0], 'Apellidos' : d2[1]} | d1
    for d1, d2 in zip(data1, data2)
]
df = pd.DataFrame(data)
print(df)
Giving you a dataframe starting:
Nombre Apellidos Sobre Dirección Población Provincia Teléfono E-mail Web
0 JUAN MARIANO MERCADO Juan Mariano Mercado Juan de Toledo, no 16, 1o B 30800 LORCA Murcia 968-471716 periagomer#hotmail.com
1 Ma. BELEN ABAD GARCIA Ma. Belen Abad Garcia Calle Constantino 33, 1o N 4700 EL EJIDO Almería 950487533 - 647936929 mariabelenabadgarcia#hotmail.com
2 JESÚS ABAD MUÑIZ Jesús Abad Muñiz Santiago, 15, 1o.- ctro. 47001 Valladolid 98.320.20.11 jabad#carlosgallegoabogados.es
3 Ma PALOMA ABAD TEJERINA Ma Paloma Abad Tejerina Poniente, 40 28036 Madrid 91.383.11.45 paloma#abadsaezabogados.com
4 GEMA ÁBALOS MUÑOZ Gema ábalos Muñoz Solarillo de Gracia, 4, 1o.- D 18002 Granada 639.317.297 3004#icagr.es
You could then use Pandas to make any further changes to the data structure. Note that the Python dictionary merge operator (`|`) requires Python 3.9 or later.

Scrape URL loop with BeautifulSoup

I want to scrap information on different pages of the same site, societe.com and I have several questions.
first of all here is the code that I managed to do, I am a bit of a novice I admit it
I only put 2 URLs to see if the loop worked and some information, I can add some when everything works
# Fetch each company page, print a few identity fields, and try to collect
# them into a DataFrame.
# NOTE(review): the original indentation was lost, so it cannot be told from
# here which statements after the `for url in urls:` line belong to the loop
# body -- in particular whether `infosend` / `tableau.append` run once per URL
# or only once after the loop. Confirm against the original post.
# NOTE(review): `requests`, `BeautifulSoup`, `np`, `pd` and `tableau` are used
# but not imported or defined anywhere in this snippet.
urls = ["https://www.societe.com/societe/decathlon-france-500569405.html","https://www.societe.com/societe/go-sport-312193899.html"]
for url in urls:
response = requests.get(url, headers = {'User-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'})
soup = BeautifulSoup(response.text, "html.parser")
# First <td> with class "numdisplay" / "break-word"; these are elements, not strings.
numrcs = soup.find("td", class_="numdisplay")
nomcommercial = soup.find("td", class_="break-word")
print(nomcommercial.text)
print(numrcs.text.strip())
# All <div> elements whose id starts with "siret_number".
numsiret = soup.select('div[id^=siret_number]')
for div in numsiret:
print(div.text.strip())
# All <div> elements whose id starts with "catjur-histo-description".
formejuri = soup.select('div[id^=catjur-histo-description]')
for div in formejuri:
print(div.text.strip())
# The dict stores the element / result-list objects themselves rather than
# the text that was printed above.
infosend = {
'numrcs': numrcs,
'nomcommercial':nomcommercial,
'numsiret':numsiret,
'formejuri':formejuri
}
tableau.append(infosend)
print(tableau)
my_infos = ['Numéro RCS', 'Numéro Siret ','Forme Juridique']
# The three labels are repeated len(nomcommercial) times; `nomcommercial` is
# whatever the last `soup.find` returned, not a list of company names.
my_columns = [
np.tile(np.array(my_infos), len(nomcommercial))
]
df = pd.DataFrame( tableau,index=nomcommercial, columns=my_columns)
df
When I run the loop I have the right information coming out, like for example
DECATHLON FRANCE
Lille Metropole B 500569405
50056940503239
SASU Société par actions simplifiée à associé unique
but I would like to put all this information in a table but I can't really, only the last company appears and the data makes no sense I tried to follow a tutorial without success.
if you can help me i would be really happy
To get data about the companies you can use next example:
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Build one row per company page: title, URL, RCS number, SIRET number and
# legal form, then print the table as markdown.
urls = [
    "https://www.societe.com/societe/decathlon-france-500569405.html",
    "https://www.societe.com/societe/go-sport-312193899.html",
]
headers = {
    "User-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
}


def _field(page, selector):
    """Return the stripped text of the first element matching *selector*."""
    return page.select_one(selector).get_text(strip=True)


data = []
for url in urls:
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")
    data.append(
        [
            _field(soup, "#identite_deno"),
            url,
            # The cell immediately following the one containing "Numéro RCS".
            _field(soup, 'td:-soup-contains("Numéro RCS") + td'),
            _field(soup, "#siret_number"),
            _field(soup, "#catjur-histo-description"),
        ]
    )

column_names = ["Title", "URL", "Numéro RCS", "Numéro Siret", "Forme Juridique"]
df = pd.DataFrame(data, columns=column_names)
print(df.to_markdown())
Prints:
Title
URL
Numéro RCS
Numéro Siret
Forme Juridique
0
DECATHLON FRANCE (DECATHLON DIRECTION GENERALE FRANCE)
https://www.societe.com/societe/decathlon-france-500569405.html
Lille Metropole B 500569405
50056940503239
SASU Société par actions simplifiée à associé unique
1
GO SPORT
https://www.societe.com/societe/go-sport-312193899.html
Grenoble B 312193899
31219389900191
Société par actions simplifiée

Pagination not iterating over pages

I want to iterate over all pages from this URL, `url = "https://www.iata.org/en/about/members/airline-list/"`, and dump the results in a .csv file.
How could a piece of code that iterates through the pages be included in the current code below?
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import Request
# Follow the pagination links of the airline list, reading the data table of
# each visited page, and write the concatenated result to a CSV file.
url = 'https://www.iata.org/en/about/members/airline-list/'
# This Request object is built but not used by the loop below, which calls
# requests.get(url) directly.
req = Request(
    url,
    headers={
        'accept':'*/*',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36',
    },
)

data = []
while True:
    print(url)
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    table_markup = soup.select_one('table.datatable').prettify()
    data.append(pd.read_html(table_markup)[0])
    # Stop when no link with an href follows the active pagination marker.
    if not soup.select_one('span.pagination-link.is-active + div a[href]'):
        break
    url = soup.select_one('span.pagination-link.is-active + div a')['href']

df = pd.concat(data)
df.to_csv('airline-list.csv',encoding='utf-8-sig',index=False)
Try this approach:
# Request pages 1 through 29 by page number and append each page's data
# table to the existing `data` list.
for page_no in range(1, 30):
    url = f'https://www.iata.org/en/about/members/airline-list/?page={page_no}&search=&ordering=Alphabetical'
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    table_markup = soup.select_one('table.datatable').prettify()
    data.append(pd.read_html(table_markup)[0])
To get data dynamically, use:
import pandas as pd
import requests
import bs4
# Read the page count from the pagination links of page 1, then download
# every page and concatenate the first table found on each.
url = 'https://www.iata.org/en/about/members/airline-list/?page={page}&search=&ordering=Alphabetical'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'}

# Total number of pages: the text of the second-to-last 'pagination-link' anchor.
html = requests.get(url.format(page=1), headers=headers)
# Name the parser explicitly so the result does not depend on which parsers
# happen to be installed.
soup = bs4.BeautifulSoup(html.text, 'html.parser')
pages = int(soup.find_all('a', {'class': 'pagination-link'})[-2].text)

data = []
for page in range(1, pages+1):
    # headers belongs to requests.get(); passed to str.format() it was
    # silently ignored and the request went out without the User-Agent.
    html = requests.get(url.format(page=page), headers=headers)
    data.append(pd.read_html(html.text)[0])
df = pd.concat(data)
Output:
>>> df
Airline Name IATA Designator 3 digit code ICAO code Country / Territory
0 ABX Air GB 832 ABX United States
1 Aegean Airlines A3 390 AEE Greece
2 Aer Lingus EI 53 EIN Ireland
3 Aero Republica P5 845 RPB Colombia
4 Aeroflot SU 555 AFL Russian Federation
.. ... ... ... ... ...
3 WestJet WS 838 WJA Canada
4 White coloured by you WI 97 WHT Portugal
5 Wideroe WF 701 WIF Norway
6 Xiamen Airlines MF 731 CXA China (People's Republic of)
7 YTO Cargo Airlines YG 860 HYT China (People's Republic of)
[288 rows x 5 columns]

Categories