import requests
from bs4 import BeautifulSoup

def req(url) -> BeautifulSoup:
    print(url)
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    return soup

def get_novels(page, sort, order, status):
    books = []
    novel_list = req(f"https://www.novelupdates.com/novelslisting/?sort={sort}&order={order}&status={status}&pg={page}")
    novels = novel_list.find_all(class_="search_main_box_nu")...
The above code gets the actual content of the page under Python 3.10.2, but under 3.9.12 it gets a bot-verification page instead.
Why is that, and how do I fix it? Please help.
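A hedged guess: the two interpreters are probably separate environments with different requests/urllib3 (and underlying OpenSSL) versions, so the TLS handshake and default headers differ and the site's bot detection reacts differently to each. Below is a minimal sketch that sends a fuller set of browser-like headers through a shared Session; the extra header values are illustrative assumptions, not a guaranteed bypass.

import requests
from bs4 import BeautifulSoup

# A shared Session reuses cookies and the underlying connection across requests.
session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
    # The headers below are assumptions about what the site may check:
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
})

def req(url) -> BeautifulSoup:
    r = session.get(url)
    return BeautifulSoup(r.text, "html.parser")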
The goal is to get Python / BeautifulSoup to scrape Yahoo Finance for the first/last name of a public company's owner:
from bs4 import BeautifulSoup
import requests
url = 'https://finance.yahoo.com/quote/GTVI/profile?p=GTVI'
page = requests.get(url, headers={
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
})
soup = BeautifulSoup(page.text, 'html.parser')
price = soup.find_all("tr", {"class": "C($primaryColor) BdB Bdc($seperatorColor) H(36px)"})
print(soup.select_one("td > span").text)
The single call above works perfectly, but I can't get it to loop and print multiple times while keeping the browser user agent masked. Here is my attempt at it (I'm new to Python, keep in mind). Haaalp :)
from bs4 import BeautifulSoup
import requests
url = ['https://finance.yahoo.com/quote/GTVI/profile?p=GTVI',
       'https://finance.yahoo.com/quote/RAFA/profile?p=RAFA',
       'https://finance.yahoo.com/quote/CYDX/profile?p=CYDX',
       'https://finance.yahoo.com/quote/TTHG/profile?p=TTHG']
names = []
for link in url:
    w=1
    reqs2 = requests.get(link)
    page = requests.get(url, headers={
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
    })
    soup = BeautifulSoup(page.text, 'html.parser')
    for x in soup.find_all("tr", {"class": "C($primaryColor) BdB Bdc($seperatorColor) H(36px)"})
        names.append(x.text)
    print(names)(soup.select_one("td > span").text)
Check your indentation and your requests calls to get the code running. Since the expected result isn't entirely clear from your question, this is just a hint at how to fix it and get a result.
Example
from bs4 import BeautifulSoup
import requests
url = ['https://finance.yahoo.com/quote/GTVI/profile?p=GTVI',
       'https://finance.yahoo.com/quote/RAFA/profile?p=RAFA',
       'https://finance.yahoo.com/quote/CYDX/profile?p=CYDX',
       'https://finance.yahoo.com/quote/TTHG/profile?p=TTHG']
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"}
names = []
for link in url:
    page = requests.get(link, headers=headers)
    soup = BeautifulSoup(page.text, 'html.parser')
    for x in soup.find_all("tr", {"class": "C($primaryColor) BdB Bdc($seperatorColor) H(36px)"}):
        names.append(x.text)
    print(soup.select_one("td > span").text)
print(names)
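As a variation, a requests.Session can reuse the connection and the masked user agent across all the profile pages. This is a minimal sketch under the same selector assumptions as the example above (Yahoo may change its markup, so the cell lookup is guarded):

import requests
from bs4 import BeautifulSoup

urls = ['https://finance.yahoo.com/quote/GTVI/profile?p=GTVI',
        'https://finance.yahoo.com/quote/RAFA/profile?p=RAFA']
session = requests.Session()
session.headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'

names = []
for link in urls:
    soup = BeautifulSoup(session.get(link).text, 'html.parser')
    cell = soup.select_one("td > span")  # first name cell, as in the answer above
    if cell:                             # hedge against missing/changed markup
        names.append(cell.text)
print(names)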
I need to scrape the content (just the titles) from a website. I did it for one page, but I need to do it for all the pages on the website.
Currently, I am doing as follows:
import bs4, requests
import pandas as pd
import re
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
r = requests.get(website, headers=headers)
soup = bs4.BeautifulSoup(r.text, 'html')
title=soup.find_all('h2')
I know that, when I move to the next page, the url changes as follows:
website/page/2/
website/page/3/
...
website/page/49/
...
I tried to build a recursive function using next_page_url = base_url + next_page_partial but it does not move to the next page.
if soup.find("span", text=re.compile("Next")):
page = "https://catania.liveuniversity.it/notizie-catania-cronaca/cronacacatenesesicilina/page/".format(page_num)
page_num +=10 # I should scrape all the pages so maybe this number should be changed as I do not know at the beginning how many pages there are for that section
print(page_num)
else:
break
I followed this question (and answer): Moving to next page for scraping using BeautifulSoup
Please let me know if you need more info. Many thanks
Updated code:
import bs4, requests
import pandas as pd
import re
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
page_num=1
website="https://catania.liveuniversity.it/notizie-catania-cronaca/cronacacatenesesicilina"
while True:
    r = requests.get(website, headers=headers)
    soup = bs4.BeautifulSoup(r.text, 'html')
    title=soup.find_all('h2')
    if soup.find("span", text=re.compile("Next")):
        page = f"https://catania.liveuniversity.it/notizie-catania-cronaca/cronacacatenesesicilina/page/{page_num}".format(page_num)
        page_num +=10
    else:
        break
If you use f"url/{page_num}" then remove format(page_num).
You can use anything you want below:
page = f"https://catania.liveuniversity.it/notizie-catania-cronaca/cronacacatenesesicilina/page/{page_num}"
or
page = "https://catania.liveuniversity.it/notizie-catania-cronaca/cronacacatenesesicilina/page/{}".format(page_num)
Good luck!
The final answer will be this:
import bs4, requests
import pandas as pd
import re

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
page_num = 2  # the base URL is page 1, so the first "Next" page is /page/2/
website = "https://catania.liveuniversity.it/notizie-catania-cronaca/cronacacatenesesicilina"
titles = []
while True:
    r = requests.get(website, headers=headers)
    soup = bs4.BeautifulSoup(r.text, 'html.parser')
    titles.extend(soup.find_all('h2'))  # accumulate titles instead of overwriting them each page
    if soup.find("span", text=re.compile("Next")):
        website = f"https://catania.liveuniversity.it/notizie-catania-cronaca/cronacacatenesesicilina/page/{page_num}"
        page_num += 1
    else:
        break
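A variation worth considering: instead of counting pages, follow the Next link's own href, which also works when the total page count is unknown. This is a sketch that assumes the "Next" span sits inside an <a> tag carrying the next page's URL; that markup detail is an assumption about the site:

import bs4, requests, re

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
website = "https://catania.liveuniversity.it/notizie-catania-cronaca/cronacacatenesesicilina"
titles = []
while website:
    soup = bs4.BeautifulSoup(requests.get(website, headers=headers).text, 'html.parser')
    titles.extend(h2.get_text(strip=True) for h2 in soup.find_all('h2'))
    next_span = soup.find("span", text=re.compile("Next"))
    # Assumption: the span is wrapped in an anchor pointing at the next page.
    link = next_span.find_parent("a") if next_span else None
    website = link["href"] if link and link.has_attr("href") else None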
This is my first time doing web scraping. When I use page = requests.get(URL), it works perfectly fine, but when I add
headers = {"User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.4 Safari/605.1.15'}
page = requests.get(URL, headers=headers)
I get this error:
title = soup.find(id="productTitle").get_text()
AttributeError: 'NoneType' object has no attribute 'get_text'
What's wrong here? Should I give up on using headers?
I think the page contains invalid HTML, and therefore BeautifulSoup is not able to find your element.
Try prettifying the HTML first:
import requests
from bs4 import BeautifulSoup
URL = 'https://www.amazon.com/dp/B07JP9QJ15/ref=dp_cerb_1'
headers = {
    "User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.4 Safari/605.1.15'}
page = requests.get(URL, headers=headers)
pretty = BeautifulSoup(page.text,'html.parser').prettify()
soup = BeautifulSoup(pretty,'html.parser')
print(soup.find(id='productTitle').get_text())
Which returns:
Dell UltraSharp U2719D - LED Monitor - 27"
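As an alternative to the prettify round-trip, a more lenient parser often copes better with malformed HTML. Here is a sketch using html5lib (assumes html5lib is installed via pip; whether it finds the element still depends on the page Amazon serves):

import requests
from bs4 import BeautifulSoup

URL = 'https://www.amazon.com/dp/B07JP9QJ15/ref=dp_cerb_1'
headers = {"User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.4 Safari/605.1.15'}
page = requests.get(URL, headers=headers)
# html5lib parses the way a browser does, repairing badly nested tags.
soup = BeautifulSoup(page.text, 'html5lib')
title = soup.find(id='productTitle')
print(title.get_text(strip=True) if title else "productTitle not found")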
I want to remove a target tr block with text. When I run my code I get nearly perfect output, but there is a problem: it also scrapes <tr><td>Domain</td><td>Last Resolved Date</td></tr>, the table's header row, which I don't want in my output. How can I remove it? Code below.
Got a fix.
Old code:
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
url = "https://viewdns.info/reverseip/?host=github.com&t=1"
text = requests.get(url, headers=headers).text
soup = BeautifulSoup(text, 'html.parser')
table = soup.find('table', attrs={'border':'1'})
domain = table.findAll('td', attrs={'align':None})
for line in domain:
    print(line.text)
Fixed
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
url = "https://viewdns.info/reverseip/?host=github.com&t=1"
text = requests.get(url, headers=headers).text
soup = BeautifulSoup(text, 'html.parser')
table = soup.find('table', attrs={'border':'1'})
domain = table.findAll('td', attrs={'align':None})[2:]
for line in domain:
    print(line.text)
Try this code:
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
url = "https://viewdns.info/reverseip/?host=github.com&t=1"
text = requests.get(url, headers=headers).text
soup = BeautifulSoup(text, 'html.parser')
table = soup.find('table', attrs={'border':'1'})
domain = table.findAll('td', attrs={'align':None})[2:]
for line in domain:
    print(line.text)
Filter out the first two objects in your domain variable:
domain = table.findAll('td', attrs={'align':None})[2:]
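If the slice feels fragile, here is a sketch that skips the header row structurally instead, by dropping the first <tr> before collecting cells (same page assumptions as above):

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
url = "https://viewdns.info/reverseip/?host=github.com&t=1"
soup = BeautifulSoup(requests.get(url, headers=headers).text, 'html.parser')
table = soup.find('table', attrs={'border': '1'})
# Skip the first row (the Domain / Last Resolved Date header), keep the rest.
for row in table.find_all('tr')[1:]:
    for cell in row.find_all('td', attrs={'align': None}):
        print(cell.text)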
I'm new to coding a web scraper with Python. I've done a few tutorials, and now I'm trying my first one. Here is a really simple test that yields the error I noted in the subject line.
import requests
from bs4 import BeautifulSoup
url = "https://www.autotrader.ca/cars/mercedes-benz/ab/calgary/?rcp=15&rcs=0&srt=3&prx=100&prv=Alberta&loc=T3P%200H2&hprc=True&wcp=True&sts=Used&adtype=Private&showcpo=1&inMarket=advancedSearch"
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
html = requests.get(url,headers={'User-Agent': user_agent})
soup = BeautifulSoup(html, "lxml")
print(soup)
Please help me out with trying out this code. Any help is greatly appreciated!
Use html.text instead of html. It's good practice to send headers bound with a user-agent inside the get() method.
import requests
from bs4 import BeautifulSoup
url = "https://www.autotrader.ca/cars/mercedes-benz/ab/calgary/?rcp=15&rcs=0&srt=3&prx=100&prv=Alberta&loc=T3P%200H2&hprc=True&wcp=True&sts=Used&adtype=Private&showcpo=1&inMarket=advancedSearch"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
response = requests.get(url,headers=headers)
soup = BeautifulSoup(response.text,"lxml")
print(soup)
Change this line:
soup = BeautifulSoup(html, "lxml")
to
soup = BeautifulSoup(html.content, "lxml")
or
soup = BeautifulSoup(html.text, "lxml")
This returns the HTML structure of the webpage.
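For reference, the difference between the two: response.text is the body decoded to a str using the detected encoding, while response.content is the raw bytes. A quick check:

import requests

r = requests.get("https://www.autotrader.ca")
print(type(r.text))     # <class 'str'>  - decoded with r.encoding
print(type(r.content))  # <class 'bytes'> - raw, undecoded body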