Python - web scraping - pagination stuck at page 1, doesn't progress further

New to Python, I wrote the following code:
import bs4
from urllib.request import urlopen as Open
from urllib.request import Request
from bs4 import BeautifulSoup as soup
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}
results = "https://www.otodom.pl/sprzedaz/mieszkanie/?nrAdsPerPage=72&search%5Border%5D=created_at_first%3Adesc&page=1"
req = Request(url=results, headers=headers)
html = Open(req).read()
page_soup = soup(html, "html.parser")
total_pages = int(page_soup.find("div",{"class":"after-offers clearfix"}).find("ul",{"class":"pager"}).findAll("li")[4].text)
page_number = 0
if page_number < total_pages:
    page_number = page_number + 1
    results = "https://www.otodom.pl/sprzedaz/mieszkanie/?nrAdsPerPage=72&search%5Border%5D=created_at_first%3Adesc&page="+str(page_number)
    print(results)
    req = Request(url=results, headers=headers)
    html = Open(req).read()
    page_soup = soup(html, "html.parser")
    listings = page_soup.findAll("article",{"data-featured-name":"listing_no_promo"})
    print(len(listings))
I would have expected the end result to be a stream of printed links and the number of listings on each page, yet all I get is:
https://www.otodom.pl/sprzedaz/mieszkanie/?nrAdsPerPage=72&search%5Border%5D=created_at_first%3Adesc&page=1
72
Any help would be appreciated, many thanks in advance!

In your script you don't have any loop to get page_soup from the next page.
This script scrapes the total number of pages and then iterates over them, printing the name of each offer and its link:
import requests
from bs4 import BeautifulSoup as soup
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}
results = "https://www.otodom.pl/sprzedaz/mieszkanie/?nrAdsPerPage=72&search%5Border%5D=created_at_first%3Adesc&page={}"
with requests.session() as s:
    req = s.get(results.format(1), headers=headers)
    page_soup = soup(req.text, "html.parser")
    total_pages = int(page_soup.find("div",{"class":"after-offers clearfix"}).find("ul",{"class":"pager"}).findAll("li")[4].text)
    print(total_pages)
    cnt = 1
    for page in range(1, 10): # <--- change 10 to total_pages to scrape all pages
        req = s.get(results.format(page), headers=headers)
        page_soup = soup(req.text, "html.parser")
        for a in page_soup.select('h3 a[data-featured-name="listing_no_promo"]'):
            name, link = a.find_next('span', {'class':'offer-item-title'}).text, a['href']
            print('{:<4} {:<50} {}'.format(cnt, name, link))
            cnt += 1
Prints:
1645
1 Biuro Sprzedaży Mieszkań 2 Pokoje Bezpośrednio https://www.otodom.pl/oferta/biuro-sprzedazy-mieszkan-2-pokoje-bezposrednio-ID43LEw.html#b3d6f6add3
2 Przestronne mieszkanie na nowej inwestycji - 2020 https://www.otodom.pl/oferta/przestronne-mieszkanie-na-nowej-inwestycji-2020-ID43LEt.html#b3d6f6add3
3 Kapitalny remont, Grabiszyńska, parking https://www.otodom.pl/oferta/kapitalny-remont-grabiszynska-parking-ID43LE0.html#b3d6f6add3
4 Przestronne mieszkanie przy ulicy Żurawiej https://www.otodom.pl/oferta/przestronne-mieszkanie-przy-ulicy-zurawiej-ID43LDZ.html#b3d6f6add3
5 Katowice Bezpośrednio 3 Pokoje https://www.otodom.pl/oferta/katowice-bezposrednio-3-pokoje-ID43LDX.html#b3d6f6add3
6 2 Pokojowe mieszkanie na osiedlu zamkniętym Łomian https://www.otodom.pl/oferta/2-pokojowe-mieszkanie-na-osiedlu-zamknietym-lomian-ID43LDV.html#b3d6f6add3
7 Słoneczne 3 pokojowe w doskonałej lokalizacji ! https://www.otodom.pl/oferta/sloneczne-3-pokojowe-w-doskonalej-lokalizacji-ID43LDS.html#b3d6f6add3
8 Inteligenty apartament Zajezdnia Wrzeszcz https://www.otodom.pl/oferta/inteligenty-apartament-zajezdnia-wrzeszcz-ID43LDR.html#b3d6f6add3
9 Mieszkanie, 32,04 m², Szczecin https://www.otodom.pl/oferta/mieszkanie-32-04-m-szczecin-ID43LDN.html#b3d6f6add3
10 M-3 Teofilów Na Sprzedaż https://www.otodom.pl/oferta/m-3-teofilow-na-sprzedaz-ID43LDI.html#b3d6f6add3
11 2-Pokojowe Mieszkanie https://www.otodom.pl/oferta/2-pokojowe-mieszkanie-ID43LDH.html#b3d6f6add3
12 2 duże pokoje w centrum Gdańsk ul. Zakopiańska https://www.otodom.pl/oferta/2-duze-pokoje-w-centrum-gdansk-ul-zakopianska-ID43LDE.html#b3d6f6add3
13 M2 na Zabobrzu III https://www.otodom.pl/oferta/m2-na-zabobrzu-iii-ID43LDx.html#b3d6f6add3
14 Mieszkanie 2 pokojowe ,atrakcyjna cena https://www.otodom.pl/oferta/mieszkanie-2-pokojowe-atrakcyjna-cena-ID43LDv.html#b3d6f6add3
15 M2 Centrum Miasta, I piętro https://www.otodom.pl/oferta/m2-centrum-miasta-i-pietro-ID43LDt.html#b3d6f6add3
16 Rodzinny 3 Pokojowy Apartament z Ogródkiem https://www.otodom.pl/oferta/rodzinny-3-pokojowy-apartament-z-ogrodkiem-ID43LDr.html#b3d6f6add3
17 2 pokoje. Aneks kuchenny. 45,5 m. Balkon https://www.otodom.pl/oferta/2-pokoje-aneks-kuchenny-45-5-m-balkon-ID43LDp.html#b3d6f6add3
... and so on.
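If you would rather keep the urllib-based version from your question, the minimal fix is to turn the if into a while loop and request each page inside it. A rough sketch of that change (untested, reusing the selectors from your code):
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

headers = {"User-Agent": "Mozilla/5.0"}
base = "https://www.otodom.pl/sprzedaz/mieszkanie/?nrAdsPerPage=72&search%5Border%5D=created_at_first%3Adesc&page={}"

def get_soup(url):
    # Fetch a page with the custom User-Agent and parse it.
    req = Request(url=url, headers=headers)
    return BeautifulSoup(urlopen(req).read(), "html.parser")

# Read the total number of pages from the pager on page 1.
page_soup = get_soup(base.format(1))
total_pages = int(page_soup.find("div", {"class": "after-offers clearfix"})
                  .find("ul", {"class": "pager"}).findAll("li")[4].text)

page_number = 0
while page_number < total_pages:   # while, not if, so every page gets visited
    page_number += 1
    url = base.format(page_number)
    print(url)
    page_soup = get_soup(url)
    listings = page_soup.findAll("article", {"data-featured-name": "listing_no_promo"})
    print(len(listings))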

Related

Send POST request in Python

I'm trying to scrape a website in which I need to send a POST request to a form to query data. Here is the code I'm using.
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"}
with requests.Session() as s:
    r = s.get('https://data.rabbu.com', headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    hidden = soup.find_all("input", {'type':'hidden'})
    payload = {x["name"]: x["value"] for x in hidden}
    payload['search'] = '16101 Tampa Street, Brooksville FL 34604'
    payload['bedrooms'] = '2'
    r = s.post('https://data.rabbu.com/e', headers=headers, data=payload)
    soup = BeautifulSoup(r.text, 'html.parser')
    print(soup.text)
But I'm unable to send the POST request properly, because I'm getting the following error message:
"The change you wanted was rejected (422)"
I tried to use the "json" argument instead of "data" - to no avail.
Do you have any idea how I can bypass this issue? Any help would be appreciated.
Your parameters need to be changed. Try the following:
from bs4 import BeautifulSoup
import requests
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"}
with requests.Session() as s:
    r = s.get('https://data.rabbu.com', headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    hidden = soup.find_all("input", {'type':'hidden'})
    payload = {x["name"]: x["value"] for x in hidden}
    payload['estimate[address]'] = '16101 Tampa Street, Brooksville FL 34604'
    payload['estimate[bedrooms]'] = '2'
    r = s.post('https://data.rabbu.com/e', headers=headers, params=payload)
    soup = BeautifulSoup(r.content, 'html.parser')
    print(soup.title.text)
Giving you:
16101 Tampa St, Brooksville, FL 34604, USA | Revenue Projection: $1,639/mo | 2 to 2bds | 13 comps | Rabbu
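If you want to check which field names the form actually expects instead of guessing, you can dump the name of every input on the page. A small sketch (the exact inputs you see depend on the current page markup):
from bs4 import BeautifulSoup
import requests

headers = {'User-Agent': "Mozilla/5.0"}
with requests.Session() as s:
    r = s.get('https://data.rabbu.com', headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    # Print the name and current value of every <input>; this is where
    # field names like estimate[address] and estimate[bedrooms] show up.
    for inp in soup.find_all('input'):
        print(inp.get('name'), '=>', inp.get('value'))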

Pagination not iterating over pages

I want to iterate over all pages from this URL, https://www.iata.org/en/about/members/airline-list/, and dump the results into a .csv file.
How could code to iterate through the pages be added to the current code below?
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import Request
url = 'https://www.iata.org/en/about/members/airline-list/'
req = Request(url, headers={
    'accept': '*/*',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'})
data = []
while True:
    print(url)
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    data.append(pd.read_html(soup.select_one('table.datatable').prettify())[0])
    if soup.select_one('span.pagination-link.is-active + div a[href]'):
        url = soup.select_one('span.pagination-link.is-active + div a')['href']
    else:
        break
df = pd.concat(data)
df.to_csv('airline-list.csv', encoding='utf-8-sig', index=False)
Try this approach:
for i in range(1, 30):
    url = f'https://www.iata.org/en/about/members/airline-list/?page={i}&search=&ordering=Alphabetical'
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    data.append(pd.read_html(soup.select_one('table.datatable').prettify())[0])
To get data dynamically, use:
import pandas as pd
import requests
import bs4
url = 'https://www.iata.org/en/about/members/airline-list/?page={page}&search=&ordering=Alphabetical'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'}
# Total number of pages
html = requests.get(url.format(page=1), headers=headers)
soup = bs4.BeautifulSoup(html.text, 'html.parser')
pages = int(soup.find_all('a', {'class': 'pagination-link'})[-2].text)
data = []
for page in range(1, pages+1):
    html = requests.get(url.format(page=page), headers=headers)
    data.append(pd.read_html(html.text)[0])
df = pd.concat(data)
Output:
>>> df
Airline Name IATA Designator 3 digit code ICAO code Country / Territory
0 ABX Air GB 832 ABX United States
1 Aegean Airlines A3 390 AEE Greece
2 Aer Lingus EI 53 EIN Ireland
3 Aero Republica P5 845 RPB Colombia
4 Aeroflot SU 555 AFL Russian Federation
.. ... ... ... ... ...
3 WestJet WS 838 WJA Canada
4 White coloured by you WI 97 WHT Portugal
5 Wideroe WF 701 WIF Norway
6 Xiamen Airlines MF 731 CXA China (People's Republic of)
7 YTO Cargo Airlines YG 860 HYT China (People's Republic of)
[288 rows x 5 columns]
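You can then write the combined frame to a file exactly as in your original script, e.g. df.to_csv('airline-list.csv', encoding='utf-8-sig', index=False).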

Python BeautifulSoup - extract URLs and request pages and then retrieve abstracts

I want to access the e-journal page and then retrieve the abstract of every article.
So I wrote code that makes a list of the URLs of the abstract pages, and it works successfully.
But when I tried to request those URLs and retrieve the abstracts, it didn't work (printing many 'None' values to the console).
This is my code.
import requests
from bs4 import BeautifulSoup
h = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'}
URL = "https://agsjournals.onlinelibrary.wiley.com/toc/15325415/2021/69/7"
JAGS_result = requests.get(URL, headers=h)
JAGS_soup = BeautifulSoup(JAGS_result.text, "html.parser")
L = []
for link in JAGS_soup.find_all('a',{"title":"Abstract"}):
    L.append(link.get('href'))
Ab_Links = []
a = 0
for ab_link in L:
    if a == len(L):
        break
    else:
        full_link = "https://agsjournals.onlinelibrary.wiley.com"+L[a]
        Ab_Links.append(full_link)
        a = a+1
print(Ab_Links)
b = 0
Ab = []
Ab_URL = Ab_Links[b]
for ab_url in Ab_Links:
    if b == len(L):
        break
    else:
        Ab_result = requests.get(Ab_Links[b], headers = h)
        Ab_soup = BeautifulSoup(Ab_result.text, "html.parser")
        abstract = Ab_soup.find({"class" : "article-section article-section__abstract"})
        Ab.append(abstract)
        b = b+1
print(Ab)
I am a novice at Python and HTML, so it is very hard for me to write this code by myself. Please help me...
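One way to do this is to use a requests.Session together with a SoupStrainer so that only the abstract links are parsed from the issue page; the script then visits each link and prints its abstract section: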
import requests
from bs4 import BeautifulSoup, SoupStrainer
from urllib.parse import urljoin
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0'
}
def main(url):
    with requests.Session() as req:
        req.headers.update(headers)
        r = req.get(url)
        soup = BeautifulSoup(
            r.content, 'lxml', parse_only=SoupStrainer('a', title='Abstract'))
        links = [urljoin(url, x['href']) for x in soup.select('a')]
        for link in links:
            r = req.get(link)
            soup = BeautifulSoup(r.text, 'lxml')
            print(soup.select_one('.article-section.article-section__abstract'))
if __name__ == "__main__":
    main('https://agsjournals.onlinelibrary.wiley.com/toc/15325415/2021/69/7')
You could try this out. It prints the abstract of every article on the page.
import requests
import bs4 as bs
url = 'https://agsjournals.onlinelibrary.wiley.com/toc/15325415/2021/69/7'
h = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'}
resp = requests.get(url, headers=h)
soup = bs.BeautifulSoup(resp.text, 'lxml')
base_url = 'https://agsjournals.onlinelibrary.wiley.com'
abstract_urls = soup.findAll('a', attrs= {'title': 'Abstract'})
for i in abstract_urls:
    a_url = base_url + i['href']
    r = requests.get(a_url, headers=h)
    soup = bs.BeautifulSoup(r.text, 'lxml')
    abs_text = soup.find('section', class_='article-section article-section__full').text.strip()
    print(abs_text)
Your code is mostly correct. The problem is with finding the abstract. In order to search for an element by class, use class_='...'. If you change your abstract = line to the following, it will return results:
abstract = Ab_soup.find(class_='article-section article-section__abstract')
Also, you can simplify your loops. for ab_link in L will iterate through each item in L and then stop. You do not need to test if a == len(L), and in fact that code will never be True, because the loop will exit before a == len(L).
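For example, a trimmed-down sketch of your script along those lines (keeping your selectors and variable names) could look like this:
import requests
from bs4 import BeautifulSoup

h = {'user-agent': 'Mozilla/5.0'}
URL = "https://agsjournals.onlinelibrary.wiley.com/toc/15325415/2021/69/7"
JAGS_soup = BeautifulSoup(requests.get(URL, headers=h).text, "html.parser")

# Build the full abstract URLs directly from the link tags.
Ab_Links = ["https://agsjournals.onlinelibrary.wiley.com" + link.get('href')
            for link in JAGS_soup.find_all('a', {"title": "Abstract"})]

# Fetch each abstract page; no counter or break conditions are needed.
Ab = []
for ab_link in Ab_Links:
    Ab_soup = BeautifulSoup(requests.get(ab_link, headers=h).text, "html.parser")
    Ab.append(Ab_soup.find(class_='article-section article-section__abstract'))
print(Ab)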

Python .strip() function gives error on variable with HTML (BeautifulSoup)

This code scrapes Amazon for a product name. I wanted to strip the whitespace from this variable, which contains HTML:
span = soup.find("span", id="productTitle")
print(span.strip())
but it gives me this error:
Traceback (most recent call last):
File "C:/Users/avensis/Desktop/Projects/AmazonScraper/Scraper.py", line 17, in <module>
print(span.strip())
TypeError: 'NoneType' object is not callable
I don't understand why this occurs. Can someone please explain? Here is my full code:
from bs4 import BeautifulSoup
import requests
import html5lib
url = 'https://www.amazon.co.uk/Pingu-PING2573-Mug/dp/B0764468MD/ref=sr_1_11?dchild=1&keywords=pingu&qid=1595849018' \
      '&sr=8-11 '
headers = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/84.0.4147.89 Safari/537.36'}
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, 'html5lib')
span = soup.find("span", id="productTitle")
print(span.strip())
I guess this is what you want to do:
from bs4 import BeautifulSoup
import requests
import html5lib
import random
url = 'https://www.amazon.co.uk/Pingu-PING2573-Mug/dp/B0764468MD/ref=sr_1_11?dchild=1&keywords=pingu&qid=1595849018' \
      '&sr=8-11 '
headers = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/84.0.4147.89 Safari/537.36'}
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, 'html5lib')
span = soup.find("span", id="productTitle")
print(span.get_text(strip=True))
prints:
Pingu - Mug | 300 ml | Ceramic | Gift Box | 11 x 8.5 x 8.5 cm
If this is what you were looking for, it was .get_text(strip=True) that you missed. As for why your version fails: span is a bs4 Tag, and accessing .strip on a Tag is treated by BeautifulSoup as span.find("strip"); since there is no <strip> tag, that returns None, and calling it raises the TypeError.
Use .get_text() method:
span.get_text().replace("\n", "")
'Pingu - Mug | 300 ml | Ceramic | Gift Box | 11 x 8.5 x 8.5 cm'

Scraping Table With Python/BS4

I'm trying to scrape the "Team Stats" table from http://www.pro-football-reference.com/boxscores/201602070den.htm with BS4 and Python 2.7. However, I'm unable to get anywhere close to it:
import requests
from bs4 import BeautifulSoup

url = 'http://www.pro-football-reference.com/boxscores/201602070den.htm'
page = requests.get(url)
soup = BeautifulSoup(page.text, "html5lib")
table = soup.findAll('table', {'id':"team_stats", "class":"stats_table"})
print table
I thought something like the above code would work but no luck.
The problem in this case is that the "Team Stats" table is located inside a comment in the HTML source which you download with requests. Locate the comment and reparse it with BeautifulSoup into a "soup" object:
import requests
from bs4 import BeautifulSoup, NavigableString
url = 'http://www.pro-football-reference.com/boxscores/201602070den.htm'
page = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'})
soup = BeautifulSoup(page.content, "html5lib")
comment = soup.find(text=lambda x: isinstance(x, NavigableString) and "team_stats" in x)
soup = BeautifulSoup(comment, "html5lib")
table = soup.find("table", id="team_stats")
print(table)
And/or, you can load the table into, for example, a pandas dataframe which is very convenient to work with:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from bs4 import NavigableString
url = 'http://www.pro-football-reference.com/boxscores/201602070den.htm'
page = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'})
soup = BeautifulSoup(page.content, "html5lib")
comment = soup.find(text=lambda x: isinstance(x, NavigableString) and "team_stats" in x)
df = pd.read_html(comment)[0]
print(df)
Prints:
Unnamed: 0 DEN CAR
0 First Downs 11 21
1 Rush-Yds-TDs 28-90-1 27-118-1
2 Cmp-Att-Yd-TD-INT 13-23-141-0-1 18-41-265-0-1
3 Sacked-Yards 5-37 7-68
4 Net Pass Yards 104 197
5 Total Yards 194 315
6 Fumbles-Lost 3-1 4-3
7 Turnovers 2 4
8 Penalties-Yards 6-51 12-102
9 Third Down Conv. 1-14 3-15
10 Fourth Down Conv. 0-0 0-0
11 Time of Possession 27:13 32:47
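The same idea extends to the other stats tables on that page, which are also hidden inside HTML comments. A sketch that parses every comment containing a table into a DataFrame (wrapping read_html in a try/except in case some comment markup is not parseable):
import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment

url = 'http://www.pro-football-reference.com/boxscores/201602070den.htm'
page = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(page.content, "html5lib")

# Collect a DataFrame for every table that sits inside an HTML comment.
tables = []
for comment in soup.find_all(string=lambda t: isinstance(t, Comment)):
    if "<table" in comment:
        try:
            tables.extend(pd.read_html(comment))
        except ValueError:
            # Skip comments whose table markup pandas cannot parse.
            pass
print(len(tables))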
