I have this code that gets all child URLs within a page.
How do I parse multipe URLs through this code?
from bs4 import BeautifulSoup
import requests
headers = {
'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/91.0.4472.114 Safari/537.36'}
source = requests.get("https://www.oddsportal.com/soccer/england/efl-cup/results/", headers=headers)
soup = BeautifulSoup(source.text, 'html.parser')
main_div = soup.find("div", class_="main-menu2 main-menu-gray")
a_tag = main_div.find_all("a")
for i in a_tag:
print(i['href'])
How do I modify it to run for multiple URLs
while my URL list is as:
df:
| | URL |
|----|---------------------------------------------------------------------|
| 0 | https://www.oddsportal.com/soccer/nigeria/npfl-pre-season/results/ |
| 1 | https://www.oddsportal.com/soccer/england/efl-cup/results/ |
| 2 | https://www.oddsportal.com/soccer/europe/guadiana-cup/results/ |
| 3 | https://www.oddsportal.com/soccer/world/kings-cup-thailand/results/ |
| 4 | https://www.oddsportal.com/soccer/poland/division-2-east/results/ |
I tried parsing it this way :
headers = {
'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/91.0.4472.114 Safari/537.36'}
for url in df:
source = requests.get(df['URL'], headers=headers)
soup = BeautifulSoup(source.text, 'html.parser')
main_div = soup.find("div", class_="main-menu2 main-menu-gray")
a_tag = main_div.find_all("a")
for i in a_tag:
print(i['href'])
However I am getting this error:
line 742, in get_adapter
raise InvalidSchema("No connection adapters were found for {!r}".format(url))
How can I modify the same to parse multiple URLs?
change
for url in df:
source = requests.get(df['URL'], headers=headers)
To
for url in df['URL']:
source = requests.get(url, headers=headers)
Related
I'm trying to get all the category, sub category and sub sub category and so on of authors URL from dmoz website using BeautifulSoup.
I'm getting the following output:
# Missing the every 2nd option/URL in first step
/Arts/Literature/Authors/A
/Arts/Literature/Authors/C
/Arts/Literature/Authors/E
/Arts/Literature/Authors/G
/Arts/Literature/Authors/Horror
. . .
# Missing the every 1st option/URL in second step
/Arts/Literature/Authors/A/Abbey,_Lynn
/Top/Arts/Literature/Authors/A/Abe,_Kobo
In the above code 2nd element is missing in 1st step and 1st element in 2nd step.
Here is my code:
scrape_url = "http://dmoz.org/Arts/Literature/Authors"
page = session.get(scrape_url, headers={
"User-Agent" : "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
})
soup = bs(page.text, 'html.parser')
find_row = soup.find_all('div', attrs = {'class':'row'})[6:7]
# get all the root category author list
for test in find_row:
if test.find('div', attrs = {'class':'panel-body'}):
test_link = test.find_all('a')
for link in test_link:
sub_cat.append(link['href'])
# now get the sub or sub-sub category author URL list
for cat in sub_cat:
scrape_cat_url = "http://dmoz.org%s" % (cat)
print('scraping...', scrape_cat_url)
page = session.get(scrape_cat_url, headers={
"User-Agent" : "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
})
soup = bs(page.text, 'html.parser')
find_row = soup.find_all('div', attrs = {'class':'row'})[6:7]
# if sub category go next level or restart
for row in find_row:
if row.find('div', attrs = {'class':'panel-body'}):
test_link = row.find_all('a')
for link in test_link:
sub_cat.append(link['href'])
records.append(scrape_cat_url)
else:
records.append(scrape_cat_url)
# remove the category url from the sub_cat list
sub_cat.remove(cat)
Can anybody suggest a better way to get all the category, sub category and sub sub category URL of authors?
Try this streamlined version of your code:
from bs4 import BeautifulSoup
headers = {"User-Agent" : "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
scrape_url = "http://dmozlive.com/Top/Arts/Literature/Authors"
page = requests.get(scrape_url, headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
find_rows = soup.find_all('div', attrs = {'class':'row'})[6:7]
cats = []
for row in find_rows:
links = row.find_all('a')
for link in links:
cats.append(link['href'])
cats
Print out:
['/Top/Arts/Literature/Authors/A',
'/Top/Arts/Literature/Authors/B',
'/Top/Arts/Literature/Authors/C',
'/Top/Arts/Literature/Authors/D',
'/Top/Arts/Literature/Authors/E',
'/Top/Arts/Literature/Authors/F',
…
Now get the subcategories:
sub_cats = []
for cat in cats:
scrape_url = f"http://dmozlive.com{cat}"
page = requests.get(scrape_url, headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
find_rows = soup.find_all('div', attrs = {'class':'row'})[6:7]
for row in find_rows:
links = row.find_all('a')
for link in links:
sub_cats.append(link['href'])
subcats
Print out:
['/Top/Arts/Literature/Authors/A/Abbey,_Edward',
'/Top/Arts/Literature/Authors/A/Abbey,_Lynn',
'/Top/Arts/Literature/Authors/A/Abbott,_Edwin_A.',
'/Top/Arts/Literature/Authors/A/Abe,_Kobo',
'/Top/Arts/Literature/Authors/A/Achebe,_Chinua',
'/Top/Arts/Literature/Authors/A/Ackroyd,_Peter',
'/Top/Arts/Literature/Authors/A/Adams,_Douglas',
…
The following code may meet your expectation that's pull all the categories and sub-categories urls.
import requests
from bs4 import BeautifulSoup
url= 'http://dmozlive.com/Top/Arts/Literature/Authors'
headers= {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"}
req=requests.get(url,headers=headers)
soup = BeautifulSoup(req.text, 'html.parser')
for cat_url in soup.select('.list-group.col-md-6 a'):
cat_url = 'http://dmozlive.com' + cat_url.get('href')
#print(cat_url)
req2=requests.get(cat_url,headers=headers)
soup2 = BeautifulSoup(req2.text, 'html.parser')
for author_url in soup2.select('.list-group-item'):
author_url= 'http://dmozlive.com' + str(author_url.get('href'))
print(author_url)
I'm trying to scrape a website in which I need to send a POST request to a form to query data. Here is the code I'm using.
import requests
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"}
with requests.Session() as s:
r = s.get('https://data.rabbu.com', headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
hidden = soup.find_all("input", {'type':'hidden'})
payload = {x["name"]: x["value"] for x in hidden}
payload['search'] = '16101 Tampa Street, Brooksville FL 34604'
payload['bedrooms'] = '2'
r = s.post('https://data.rabbu.com/e', headers=headers, data=payload)
soup = BeautifulSoup(r.text, 'html.parser')
print(soup.text)
But I'm unable to send properly the POST request because I'm getting the following error message:
"The change you wanted was rejected (422)"
I tried to use the "json" argument instead of "data" - to no avail.
Do you have any idea how I can bypass this issue? Any help would be appreciated.
Your parameters need to be changed. Try the following:
from bs4 import BeautifulSoup
import requests
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"}
with requests.Session() as s:
r = s.get('https://data.rabbu.com', headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
hidden = soup.find_all("input", {'type':'hidden'})
payload = {x["name"]: x["value"] for x in hidden}
payload['estimate[address]'] = '16101 Tampa Street, Brooksville FL 34604'
payload['estimate[bedrooms]'] = '2'
r = s.post('https://data.rabbu.com/e', headers=headers, params=payload)
soup = BeautifulSoup(r.content, 'html.parser')
print(soup.title.text)
Giving you:
16101 Tampa St, Brooksville, FL 34604, USA | Revenue Projection: $1,639/mo | 2 to 2bds | 13 comps | Rabbu
My list xfrs, returns a blank DF when I convert it....does anyone see any issues with the code?
I'm able to append and print the list fine, but when I append, the DF transfers is blank.
url2 = 'https://247sports.com/Season/2020-Football/TransferPortalPositionRanking/'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
response = requests.get(url2, headers = headers)
soup = BeautifulSoup(response.content, 'html.parser')
xfrs = []
schools = []
for li in soup.findAll('li', attrs={'class':'transfer-player'}):
xfrs.append(li.find('a').contents)
schools.append(li.find('li', attrs={'class':'destination'}))
transfers = pd.DataFrame(xfrs, columns=['Players'])
print(transfers)
As mentioned, .contents returns a list of BeautifulSoup objects, so you need to use for example .text to get the name. Also take care of your selection it should be more specific.
Storing the scraped data in a dataframe try to collect it as list of dicts:
data.append({
'Player':li.h3.text,
'Destination':destination['alt'] if (destination:=li.select_one('img[class="logo"]')) else None
})
Example
import requests,json
from bs4 import BeautifulSoup as bs
url2 = 'https://247sports.com/Season/2020-Football/TransferPortalPositionRanking/'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
response = requests.get(url2, headers = headers)
soup = BeautifulSoup(response.content, 'html.parser')
data = []
for li in soup.find_all('li', attrs={'class':'transfer-player'}):
data.append({
'Player':li.h3.text,
'Destination':destination['alt'] if (destination:=li.select_one('img[class="logo"]')) else None
})
pd.DataFrame(data)
Output
Player
Destination
JT Daniels
Georgia
KJ Costello
Mississippi State
Jamie Newman
Georgia
...
...
This code scrapes amazon for a product name. I wanted to strip this variable, which contains HTML of its whitespace,
span = soup.find("span", id="productTitle")
print(span.strip())
but it gives me this error;
Traceback (most recent call last):
File "C:/Users/avensis/Desktop/Projects/AmazonScraper/Scraper.py", line 17, in <module>
print(span.strip())
TypeError: 'NoneType' object is not callable
I don't understand why this occurs. Can someone please explain? Here is my full code:
from bs4 import BeautifulSoup
import requests
import html5lib
url = 'https://www.amazon.co.uk/Pingu-PING2573-Mug/dp/B0764468MD/ref=sr_1_11?dchild=1&keywords=pingu&qid=1595849018' \
'&sr=8-11 '
headers = {
"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/84.0.4147.89 Safari/537.36'}
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, 'html5lib')
span = soup.find("span", id="productTitle")
print(span.strip())
I guess this is what you want to do:
from bs4 import BeautifulSoup
import requests
import html5lib
import random
url = 'https://www.amazon.co.uk/Pingu-PING2573-Mug/dp/B0764468MD/ref=sr_1_11?dchild=1&keywords=pingu&qid=1595849018' \
'&sr=8-11 '
headers = {
"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/84.0.4147.89 Safari/537.36'}
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, 'html5lib')
span = soup.find("span", id="productTitle")
print(span.get_text(strip=True))
prints:
Pingu - Mug | 300 ml | Ceramic | Gift Box | 11 x 8.5 x 8.5 cm
If it is what you looking for it was the .get_text(strip=True) you missed
Use .get_text() method:
span.get_text().replace("\n", "")
'Pingu - Mug | 300 ml | Ceramic | Gift Box | 11 x 8.5 x 8.5 cm'
Any idea how can i retrieve the price (now 2917.99) from this source code view-https://www.emag.ro/televizor-led-smart-samsung-138-cm-55ru7402-4k-ultra-hd-ue55ru7402uxxh/pd/DTN2XZBBM/
If I call the class p.product-new-price i get None.
I have managed to get the title, but not the price.
What I have done so far:
import requests
from bs4 import BeautifulSoup
URL = 'https://www.emag.ro/televizor-led-smart-samsung-138-cm-55ru7402-4k-ultra-hd-ue55ru7402uxxh/pd/DTN2XZBBM/'
headers = {"User-Agent":'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
page = requests.get(URL, headers = headers)
soup = BeautifulSoup(page.content, 'html.parser')
title = soup.find('title')
div = soup.find('div', {"class" : 'product-new-price'})
text = div.string
print(text)
The class looks like below and I want to extract the 2917 as int.
div class="product-highlight product-page-pricing"
p class="product-new-price"
2.917<sup>99</sup> <span>Lei</span>
Thank you very much!
Ok, with minor modifications:
It seems that the class product-new-price is on the p element for me!
I am assuming there will always be a <sup> tag after the main price
import requests
from bs4 import BeautifulSoup
URL = 'https://www.emag.ro/televizor-led-smart-samsung-138-cm-55ru7402-4k-ultra-hd-ue55ru7402uxxh/pd/DTN2XZBBM/'
headers = {"User-Agent":'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
page = requests.get(URL, headers = headers)
soup = BeautifulSoup(page.content, 'html.parser')
title = soup.find('title')
p = soup.find('p', {"class" : 'product-new-price'})
# Get the text before <sup> tag
value = p.find('sup').previousSibling.strip()
print("Value: {}".format(value))
# Keep only numbers
value = ''.join(c for c in value if c.isdigit())
price = int(value)
print("Price: {}".format(price))
The above prints:
$ python3 ./test.py
Value: 2.917
Price: 2917
Now, with small changes you can also add the missing .99 if this is required