Send POST request in Python - python

I'm trying to scrape a website in which I need to send a POST request to a form to query data. Here is the code I'm using.
# Question code (as posted; indentation was lost in the paste — the lines
# after the `with` statement would need indenting for this to run).
import requests
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"}
# NOTE(review): BeautifulSoup is used below but never imported in this
# snippet ('from bs4 import BeautifulSoup' is missing).
with requests.Session() as s:
r = s.get('https://data.rabbu.com', headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
# Collect the form's hidden inputs (auth token etc.) into the payload.
hidden = soup.find_all("input", {'type':'hidden'})
payload = {x["name"]: x["value"] for x in hidden}
# NOTE(review): per the accepted answer, the form expects the keys
# 'estimate[address]' / 'estimate[bedrooms]' — these plain keys are what
# triggers the 422 "change you wanted was rejected" response.
payload['search'] = '16101 Tampa Street, Brooksville FL 34604'
payload['bedrooms'] = '2'
r = s.post('https://data.rabbu.com/e', headers=headers, data=payload)
soup = BeautifulSoup(r.text, 'html.parser')
print(soup.text)
But I'm unable to send the POST request properly, because I'm getting the following error message:
"The change you wanted was rejected (422)"
I tried to use the "json" argument instead of "data" - to no avail.
Do you have any idea how I can bypass this issue? Any help would be appreciated.

Your parameters need to be changed. Try the following:
from bs4 import BeautifulSoup
import requests

# Browser-like User-Agent so the site serves the normal HTML form.
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"}

with requests.Session() as session:
    # Load the landing page and harvest every hidden form input
    # (auth token etc.) into the request payload.
    landing = session.get('https://data.rabbu.com', headers=headers)
    form = BeautifulSoup(landing.content, 'html.parser')
    payload = {
        field["name"]: field["value"]
        for field in form.find_all("input", {'type': 'hidden'})
    }

    # The form's real field names are namespaced under "estimate[...]".
    payload['estimate[address]'] = '16101 Tampa Street, Brooksville FL 34604'
    payload['estimate[bedrooms]'] = '2'

    result = session.post('https://data.rabbu.com/e', headers=headers, params=payload)
    print(BeautifulSoup(result.content, 'html.parser').title.text)
Giving you:
16101 Tampa St, Brooksville, FL 34604, USA | Revenue Projection: $1,639/mo | 2 to 2bds | 13 comps | Rabbu

Related

What is the fix for this Error: 'NoneType' object has no attribute 'prettify'

I want to scrape this URL https://aviation-safety.net/wikibase/type/C206.
I don't understand the meaning of this error below:
'NoneType' object has no attribute 'prettify'
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import Request
url = 'https://aviation-safety.net/wikibase/type/C206'
# NOTE(review): this Request object (and its headers) is built but never
# used — the actual fetch below calls requests.get(url) with NO headers,
# so the site serves different HTML than a browser would get (see the
# answer below).
req = Request(url , headers = {
'accept':'*/*',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'})
data = []
# Follow the "next page" link until there is none left.
# (Indentation of the loop body was lost in the paste.)
while True:
print(url)
html = requests.get(url)
soup = BeautifulSoup(html.text, 'html.parser')
# NOTE(review): when the fetched page has no <tbody> (the headerless
# request got alternate HTML), select_one returns None and
# None.prettify() raises "'NoneType' object has no attribute 'prettify'".
data.append(pd.read_html(soup.select_one('tbody').prettify())[0])
if soup.select_one('div.pagenumbers + div a[href]'):
url = soup.select_one('div.pagenumbers + div a')['href']
else:
break
df = pd.concat(data)
df.to_csv('206.csv',encoding='utf-8-sig',index=False)
You're not using headers with requests, which is the reason you're not getting the right HTML — and the table you're after is the second one, not the first. Also, I'd highly recommend using requests over urllib.request.
So, having said that, here's how to get all the tables from all the pages:
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = 'https://aviation-safety.net/wikibase/type/C206'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36',
}

frames = []
with requests.Session() as session:
    # The last link in the pagination bar holds the total page count.
    first_page = BeautifulSoup(session.get(url, headers=headers).text, "lxml")
    total_pages = int(first_page.select("div.pagenumbers > a")[-1].getText())

    for page_no in range(1, total_pages + 1):
        print(f"Getting page: {page_no}...")
        page_html = session.get(f"{url}/{page_no}", headers=headers).text
        # Index 1: the accident listing is the second table on the page.
        frames.append(pd.read_html(page_html, flavor="lxml")[1])

df = pd.concat(frames)
df.to_csv('206.csv', sep=";", index=False)

List Converts to Blank Dataframe

My list xfrs returns a blank DataFrame when I convert it... does anyone see any issues with the code?
I'm able to append and print the list fine, but when I append, the DF transfers is blank.
# Question code (as posted; indentation lost in the paste).
url2 = 'https://247sports.com/Season/2020-Football/TransferPortalPositionRanking/'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
response = requests.get(url2, headers = headers)
soup = BeautifulSoup(response.content, 'html.parser')
xfrs = []
schools = []
for li in soup.findAll('li', attrs={'class':'transfer-player'}):
# NOTE(review): .contents returns a list of child BeautifulSoup nodes,
# not a string — per the answer below, use .text (e.g. li.h3.text).
xfrs.append(li.find('a').contents)
schools.append(li.find('li', attrs={'class':'destination'}))
transfers = pd.DataFrame(xfrs, columns=['Players'])
print(transfers)
As mentioned, .contents returns a list of BeautifulSoup objects, so you need to use, for example, .text to get the name. Also take care with your selection; it should be more specific.
Storing the scraped data in a dataframe try to collect it as list of dicts:
# Excerpt of the loop body (full example below): collect each row as a
# dict so the DataFrame gets named columns; the walrus assignment keeps
# Destination as None when the player has no destination logo.
data.append({
'Player':li.h3.text,
'Destination':destination['alt'] if (destination:=li.select_one('img[class="logo"]')) else None
})
Example
# Scrape the transfer-portal ranking into a DataFrame of player/destination
# rows. Fixes over the posted snippet: pandas is actually imported (the
# original called pd.DataFrame without importing pandas), the unused json
# import is dropped, and the resulting frame is printed so the script shows
# the output it claims.
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

url2 = 'https://247sports.com/Season/2020-Football/TransferPortalPositionRanking/'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}

response = requests.get(url2, headers=headers)
soup = bs(response.content, 'html.parser')

data = []
for li in soup.find_all('li', attrs={'class': 'transfer-player'}):
    data.append({
        'Player': li.h3.text,
        # Walrus: keep Destination as None when there is no logo element.
        'Destination': destination['alt'] if (destination := li.select_one('img[class="logo"]')) else None
    })

print(pd.DataFrame(data))
Output
Player
Destination
JT Daniels
Georgia
KJ Costello
Mississippi State
Jamie Newman
Georgia
...
...

Python BeautifulSoup - extract URLs and request pages and then retrieve abstracts

I want to access the E-journal page and then retrieve every abstract of the articles.
So I wrote the code that makes a list of the URLs of abstract pages. And it works successfully.
But when I tried to request the URLs and retrieve the abstracts, it didn't work. (with many 'None' in the console.)
This is my code.
import requests
from bs4 import BeautifulSoup
h = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'}
URL = "https://agsjournals.onlinelibrary.wiley.com/toc/15325415/2021/69/7"
JAGS_result = requests.get(URL, headers=h)
JAGS_soup = BeautifulSoup(JAGS_result.text, "html.parser")
# Collect the relative href of every "Abstract" link on the TOC page.
L = []
for link in JAGS_soup.find_all('a',{"title":"Abstract"}):
L.append(link.get('href'))
Ab_Links = []
a = 0
# NOTE(review): the `a == len(L)` guard can never be True inside this
# loop — the for-loop finishes first (see the answer below), so the
# if/else and counter are redundant.
for ab_link in L:
if a == len(L):
break
else:
full_link = "https://agsjournals.onlinelibrary.wiley.com"+L[a]
Ab_Links.append(full_link)
a = a+1
print(Ab_Links)
b = 0
Ab = []
Ab_URL = Ab_Links[b]
for ab_url in Ab_Links:
if b == len(L):
break
else:
Ab_result = requests.get(Ab_Links[b], headers = h)
Ab_soup = BeautifulSoup(Ab_result.text, "html.parser")
# NOTE(review): the bug the question asks about — a dict passed as the
# first positional argument of find() is treated as a tag-name filter,
# not as attributes, so this matches nothing ('None' in the console).
# Search by class with find(class_='...') instead (see answer below).
abstract = Ab_soup.find({"class" : "article-section article-section__abstract"})
Ab.append(abstract)
b = b+1
print(Ab)
I am a novice to python and HTML so it is very hard to write code by myself. Please help me...
import requests
from bs4 import BeautifulSoup, SoupStrainer
from urllib.parse import urljoin

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0'
}


def main(url):
    """Fetch the TOC page, follow every 'Abstract' link and print each abstract."""
    with requests.Session() as session:
        session.headers.update(headers)
        toc_response = session.get(url)
        # Parse only the <a title="Abstract"> anchors; SoupStrainer skips
        # the rest of the document.
        anchors = BeautifulSoup(
            toc_response.content, 'lxml', parse_only=SoupStrainer('a', title='Abstract'))
        abstract_links = [urljoin(url, anchor['href']) for anchor in anchors.select('a')]
        for abstract_link in abstract_links:
            article = BeautifulSoup(session.get(abstract_link).text, 'lxml')
            print(article.select_one('.article-section.article-section__abstract'))


if __name__ == "__main__":
    main('https://agsjournals.onlinelibrary.wiley.com/toc/15325415/2021/69/7')
You could try this out.
This prints the abstract of all the articles in the page.
import requests
import bs4 as bs

url = 'https://agsjournals.onlinelibrary.wiley.com/toc/15325415/2021/69/7'
h = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'}
base_url = 'https://agsjournals.onlinelibrary.wiley.com'

# Gather every anchor on the TOC page that points at an abstract, then
# visit each article and print its abstract text.
toc_soup = bs.BeautifulSoup(requests.get(url, headers=h).text, 'lxml')
for anchor in toc_soup.findAll('a', attrs={'title': 'Abstract'}):
    article_page = requests.get(base_url + anchor['href'], headers=h)
    article_soup = bs.BeautifulSoup(article_page.text, 'lxml')
    # The article's full section element carries the abstract text.
    print(article_soup.find('section', class_='article-section article-section__full').text.strip())
Your code is mostly correct. The problem is with finding the abstract. In order to search for an element by class, use class_='...'. If you change your abstract = line to the following, it will return results:
abstract = Ab_soup.find(class_='article-section article-section__abstract')
Also, you can simplify your loops. for ab_link in L will iterate through each item in L and then stop. You do not need to test if a == len(L), and in fact that code will never be True, because the loop will exit before a == len(L).

How do I modify code to parse multiple URL?

I have this code that gets all child URLs within a page.
How do I parse multiple URLs through this code?
from bs4 import BeautifulSoup
import requests
headers = {
'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/91.0.4472.114 Safari/537.36'}
# Fetch one results page and print the href of every link found inside
# the "main-menu2 main-menu-gray" div. (Loop indentation lost in paste.)
source = requests.get("https://www.oddsportal.com/soccer/england/efl-cup/results/", headers=headers)
soup = BeautifulSoup(source.text, 'html.parser')
main_div = soup.find("div", class_="main-menu2 main-menu-gray")
a_tag = main_div.find_all("a")
for i in a_tag:
print(i['href'])
How do I modify it to run for multiple URLs
while my URL list is as:
df:
| | URL |
|----|---------------------------------------------------------------------|
| 0 | https://www.oddsportal.com/soccer/nigeria/npfl-pre-season/results/ |
| 1 | https://www.oddsportal.com/soccer/england/efl-cup/results/ |
| 2 | https://www.oddsportal.com/soccer/europe/guadiana-cup/results/ |
| 3 | https://www.oddsportal.com/soccer/world/kings-cup-thailand/results/ |
| 4 | https://www.oddsportal.com/soccer/poland/division-2-east/results/ |
I tried parsing it this way :
headers = {
'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/91.0.4472.114 Safari/537.36'}
# NOTE(review): the bug the question asks about — iterating a DataFrame
# yields its COLUMN LABELS, and worse, the whole df['URL'] Series (not a
# single string) is passed to requests.get, which is what raises
# InvalidSchema ("No connection adapters were found"). The fix below
# iterates df['URL'] and passes each url.
for url in df:
source = requests.get(df['URL'], headers=headers)
soup = BeautifulSoup(source.text, 'html.parser')
main_div = soup.find("div", class_="main-menu2 main-menu-gray")
a_tag = main_div.find_all("a")
for i in a_tag:
print(i['href'])
However I am getting this error:
line 742, in get_adapter
raise InvalidSchema("No connection adapters were found for {!r}".format(url))
How can I modify the same to parse multiple URLs?
change
for url in df:
source = requests.get(df['URL'], headers=headers)
To
for url in df['URL']:
source = requests.get(url, headers=headers)

Python .strip() function gives error on variable with HTML (BeautifulSoup)

This code scrapes Amazon for a product name. I wanted to strip this variable, which contains HTML, of its whitespace:
span = soup.find("span", id="productTitle")
print(span.strip())
but it gives me this error;
Traceback (most recent call last):
File "C:/Users/avensis/Desktop/Projects/AmazonScraper/Scraper.py", line 17, in <module>
print(span.strip())
TypeError: 'NoneType' object is not callable
I don't understand why this occurs. Can someone please explain? Here is my full code:
from bs4 import BeautifulSoup
import requests
import html5lib
url = 'https://www.amazon.co.uk/Pingu-PING2573-Mug/dp/B0764468MD/ref=sr_1_11?dchild=1&keywords=pingu&qid=1595849018' \
'&sr=8-11 '
headers = {
"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/84.0.4147.89 Safari/537.36'}
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, 'html5lib')
span = soup.find("span", id="productTitle")
# NOTE(review): `span` is a bs4 Tag, and Tag has no .strip() method —
# unknown attribute access on a Tag falls back to a child-tag lookup and
# yields None, so span.strip is None and calling it raises the
# "TypeError: 'NoneType' object is not callable" in the traceback.
# The answer below uses span.get_text(strip=True) instead.
print(span.strip())
I guess this is what you want to do:
# Print the Amazon product title with surrounding whitespace removed.
# Fix over the posted snippet: the unused `import random` is dropped.
from bs4 import BeautifulSoup
import requests
import html5lib  # parser backend selected by name in BeautifulSoup below

url = 'https://www.amazon.co.uk/Pingu-PING2573-Mug/dp/B0764468MD/ref=sr_1_11?dchild=1&keywords=pingu&qid=1595849018' \
      '&sr=8-11 '

headers = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/84.0.4147.89 Safari/537.36'}

page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, 'html5lib')

span = soup.find("span", id="productTitle")
# A Tag has no .strip(); get_text(strip=True) extracts the text node and
# trims surrounding whitespace in one call.
print(span.get_text(strip=True))
prints:
Pingu - Mug | 300 ml | Ceramic | Gift Box | 11 x 8.5 x 8.5 cm
If this is what you were looking for, it was the .get_text(strip=True) that you missed.
Use .get_text() method:
span.get_text().replace("\n", "")
'Pingu - Mug | 300 ml | Ceramic | Gift Box | 11 x 8.5 x 8.5 cm'

Categories