I am working on a project to find the latest 10-K filing URL for a company using its CIK number. Please find the code below:
import requests
from bs4 import BeautifulSoup
# CIK number for Apple is 0000320193
cik_number = "0000320193"
url = f"https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={cik_number}&type=10-K&dateb=&owner=exclude&count=40"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
# Find the link to the latest 10-K filing
link = soup.find('a', {'id': 'documentsbutton'})
filing_url = link['href']
print(filing_url)
I am getting an HTTP 403 error. Please help me.
Thanks
I was able to get a 200 response by reusing your snippet. You may have missed adding the headers:
import requests
from bs4 import BeautifulSoup
# CIK number for Apple is 0000320193
cik_number = "0000320193"
url = f'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={cik_number}&type=10-K&dateb=&owner=exclude&count=40'
# add this
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
    'Upgrade-Insecure-Requests': '1',
    'DNT': '1',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate'
}
response = requests.get(url, headers=headers)
print(response)
soup = BeautifulSoup(response.text, 'html.parser')
print(soup)
Output:
<Response [200]>
...followed by the full HTML dumped by print(soup).
NOTE:
You can read more about why we need to add a User-Agent to our headers here. Basically, what you need to do is make sure the request looks like it's coming from a browser, so just add the extra header parameter as shown above.
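One more note specific to this site: SEC EDGAR's fair-access guidance actually asks automated tools to declare who they are in the User-Agent rather than impersonate a browser. A minimal sketch, where the sample company name and email address are placeholders you should replace with your own contact details:
import requests
# SEC EDGAR asks for a declarative User-Agent with contact information
headers = {'User-Agent': 'Sample Company Name admin@example.com'}
cik_number = "0000320193"
url = f"https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={cik_number}&type=10-K&dateb=&owner=exclude&count=40"
response = requests.get(url, headers=headers)
print(response.status_code)  # expect 200 for a well-formed request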
from bs4 import BeautifulSoup
import requests
url = "https://www.gamerdvr.com/gamer/cookz/videos"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
element = soup.find('span', id="most-recorded")
print(element)
This always prints None, but when I go to the website, I can see the element. I even deleted all my cookies and it's still there.
Without specifying a user agent, the site does not give you the tag you need.
from bs4 import BeautifulSoup
import requests
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}
url = "https://www.gamerdvr.com/gamer/cookz/videos"
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')
element = soup.find('span', {'id': "most-recorded"}).get_text(strip=True)
print(element)
OUTPUT:
Fortnite
After watching a video, I tried to fetch the price of an item from amazon.de using the BeautifulSoup API.
#My CODE
import requests
from bs4 import BeautifulSoup
URL = 'https://www.amazon.de/Neues-Apple-iPhone-Pro-128-GB/dp/B08L5SNWD2/ref=sr_1_1_sspa?__mk_de_DE=%C3%85M%C3%85%C5%BD%C3%95%C3%91&crid=3UH87RWLLO40E&dchild=1&keywords=iphone+12+pro&qid=1605603669&sprefix=Iphone+12%2Caps%2C175&sr=8-1-spons&psc=1&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUEzRjAxN0xWNTk0TVpYJmVuY3J5cHRlZElkPUEwNzE4ODIxMktCWlhJMVlHWDFNMyZlbmNyeXB0ZWRBZElkPUExMDMwODk2Tk5OVkdZRTJISDVMJndpZGdldE5hbWU9c3BfYXRmJmFjdGlvbj1jbGlja1JlZGlyZWN0JmRvTm90TG9nQ2xpY2s9dHJ1ZQ=='
headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'lxml')
#I tried other parsing methods too: 'html.parser', 'html5lib'. Not helpful
title = soup.find(id="productTitle").get_text()
price = soup.find(id='priceblock_ourprice')
print(title) #returns correct string from the URL above
print(price)
#returns None. Unexpected. Expecting the price text from <span id="priceblock_ourprice">
If anyone can spot what's wrong in my code, that would be really helpful.
Thanks in Advance!
I cannot reproduce the None; the code works fine. I just added get_text() to the price and strip() to both variables to make the result a little cleaner.
import requests, time
from bs4 import BeautifulSoup
URL = 'https://www.amazon.de/Neues-Apple-iPhone-Pro-128-GB/dp/B08L5SNWD2/ref=sr_1_1_sspa?__mk_de_DE=%C3%85M%C3%85%C5%BD%C3%95%C3%91&crid=3UH87RWLLO40E&dchild=1&keywords=iphone+12+pro&qid=1605603669&sprefix=Iphone+12%2Caps%2C175&sr=8-1-spons&psc=1&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUEzRjAxN0xWNTk0TVpYJmVuY3J5cHRlZElkPUEwNzE4ODIxMktCWlhJMVlHWDFNMyZlbmNyeXB0ZWRBZElkPUExMDMwODk2Tk5OVkdZRTJISDVMJndpZGdldE5hbWU9c3BfYXRmJmFjdGlvbj1jbGlja1JlZGlyZWN0JmRvTm90TG9nQ2xpY2s9dHJ1ZQ=='
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'Cache-Control': 'no-cache'
}
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
#I tried other parsing methods too: 'html.parser', 'html5lib'. Not helpful
title = soup.find(id="productTitle").get_text().strip()
# to prevent script from crashing when there isn't a price for the product
try:
    price = soup.find(id='priceblock_ourprice').get_text().strip()
    # slice the string down to the price (note: convertedPrice is still a string, not a float)
    convertedPrice = price[:8]
except AttributeError:
    price = 'not loaded'
    convertedPrice = 'not loaded'
print(title) #returns correct string from the URL above
print(price)
print(convertedPrice)
Output
Neues Apple iPhone 12 Pro (128 GB) - Graphit
1.120,00 €
1.120,00
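Note that convertedPrice above is still a string. If you actually need a float, slicing isn't enough, because the German locale uses a dot for thousands and a comma for decimals. A minimal sketch with a hypothetical helper (parse_de_price is not part of the answer above):
# hypothetical helper: turn a German-formatted price string into a float
def parse_de_price(price_text):
    digits = price_text.replace('€', '').strip()        # drop the currency symbol
    digits = digits.replace('.', '').replace(',', '.')  # '1.120,00' -> '1120.00'
    return float(digits)
print(parse_de_price('1.120,00 €'))  # 1120.0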
But, as @Chase mentioned, if the content is dynamically generated, you may give Selenium a try. It can handle the load with its waits: by adding a delay, you can wait until the page has loaded and the dynamically generated content is present, and then grab your information. A minimal wait sketch follows.
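A minimal sketch of an explicit wait, assuming Selenium is installed and a compatible chromedriver is available; the element id is the one from the question:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
URL = 'https://www.amazon.de/dp/B08L5SNWD2'  # the product page from the question
driver = webdriver.Chrome()
try:
    driver.get(URL)
    # wait up to 10 seconds for the price element to appear in the DOM
    price_el = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, 'priceblock_ourprice'))
    )
    print(price_el.text)
finally:
    driver.quit()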
I tried to scrape a certain piece of information from a webpage but failed miserably. The text I wish to grab is available in the page source, but I still can't fetch it. This is the site address. I'm after the portion visible in the image as Not Rated.
Relevant html:
<div class="subtext">
Not Rated
<span class="ghost">|</span> <time datetime="PT188M">
3h 8min
</time>
<span class="ghost">|</span>
Drama,
Musical,
Romance
<span class="ghost">|</span>
<a href="/title/tt0150992/releaseinfo?ref_=tt_ov_inf" title="See more release dates">18 June 1999 (India)
</a> </div>
I've tried with:
import requests
from bs4 import BeautifulSoup
link = "https://www.imdb.com/title/tt0150992/?ref_=ttfc_fc_tt"
with requests.Session() as s:
    s.headers['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
    r = s.get(link)
    soup = BeautifulSoup(r.text, "lxml")
    rating = soup.select_one(".titleBar .subtext").next_element
    print(rating)
I get None using the script above.
Expected output:
Not Rated
How can I get the rating from that webpage?
If you want to get the correct version of the HTML page, specify the Accept-Language HTTP header:
import requests
from bs4 import BeautifulSoup
link = "https://www.imdb.com/title/tt0150992/?ref_=ttfc_fc_tt"
with requests.Session() as s:
    s.headers['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
    s.headers['Accept-Language'] = 'en-US,en;q=0.5'  # <-- specify this as well!
    r = s.get(link)
    soup = BeautifulSoup(r.text, "lxml")
    rating = soup.select_one(".titleBar .subtext").next_element
    print(rating)
Prints:
Not Rated
There is a better way of getting info on the page. If you dump the HTML content returned by the request:
import requests
from bs4 import BeautifulSoup
link = "https://www.imdb.com/title/tt0150992/?ref_=ttfc_fc_tt"
with requests.Session() as s:
    s.headers['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
    r = s.get(link)
    soup = BeautifulSoup(r.text, "lxml")
    with open("response.html", "w", encoding=r.encoding) as file:
        file.write(r.text)
you will find an element <script type="application/ld+json"> which contains all the information about the movie.
Then you simply get the element's text, parse it as JSON, and use the JSON to extract the info you want.
Here is a working example:
import json
import requests
from bs4 import BeautifulSoup
link = "https://www.imdb.com/title/tt0150992/?ref_=ttfc_fc_tt"
with requests.Session() as s:
    s.headers['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
    r = s.get(link)
    soup = BeautifulSoup(r.text, "lxml")
    # find the element <script type="application/ld+json"> and get its content
    movie_data = soup.find("script", attrs={"type": "application/ld+json"}).next
    movie_data = json.loads(movie_data)  # parse the data as JSON
    content_rating = movie_data["contentRating"]  # get the rating
IMDb is one of those webpages that makes it incredibly easy to do web scraping, and I love it. To make it easy for web scrapers, they put a script at the top of the HTML that contains the whole movie object in JSON format.
So to get all the relevant information and organize it, you simply need to grab the content of that single script tag and convert it to JSON; then you can ask for specific fields as you would with a dictionary.
import requests
import json
from bs4 import BeautifulSoup
#This part is basically the same as yours
link = "https://www.imdb.com/title/tt0150992/?ref_=ttfc_fc_tt"
r = requests.get(link)
soup = BeautifulSoup(r.content,"lxml")
#Why not get the whole json element of the movie?
script = soup.find('script', {"type" : "application/ld+json"})
element = json.loads(script.text)
print(element['contentRating'])
#Outputs "Not Rated"
# You can also inspect the rest of the JSON; it has all the relevant information inside
#Just -> print(json.dumps(element, indent=2))
Note:
Headers and session are not necessary in this example.
How can I scrape the latitude and longitude from the Google results shown in the image below using Beautiful Soup?
[Image: Google search result showing latitude and longitude]
Here is the code to do it with bs4:
from requests import get
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',}
response = get("https://www.google.com/search?q=latitude+longitude+of+75270+postal+code+paris+france",headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
a = soup.find("div", class_= "Z0LcW").text
print(a)
Please provide more detail in future questions, since we don't want to do the pre-work of creating a solution ourselves.
You will have to grab this container:
<div class="HwtpBd gsrt PZPZlf" data-attrid="kc:/location/location:coordinates" aria-level="3" role="heading"><div class="Z0LcW XcVN5d">48.8573° N, 2.3370° E</div><div></div></div>
BS4
# BeautifulSoup stuff
import requests
from bs4 import BeautifulSoup
# Make the request
url = "https://www.google.com/search?q=latitude+longitude+of+75270+postal+code+paris+france&rlz=1C1CHBF_deDE740DE740&oq=latitude+longitude+of+75270+postal+code+paris+france&aqs=chrome..69i57.4020j0j8&sourceid=chrome&ie=UTF-8"
response = requests.get(url)
# Get the HTML text of the response
html = response.text
# Parse it into an HTML document
soup = BeautifulSoup(html, 'html.parser')
# Grab the container and its content
target_container = soup.find("div", {"class": "Z0LcW XcVN5d"}).text
Then you have the string inside the returned div... assuming Google doesn't change the class declarations randomly. I tried five refreshes and the class name didn't change, but who knows.
Make sure you're using a user-agent (you can also use the Python fake-useragent library; a sketch follows the example below).
Code (also on replit.com) that grabs the location from Google Search results:
from bs4 import BeautifulSoup
import requests
headers = {
    'User-agent':
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
html = requests.get('https://www.google.com/search?q=latitude longitude of 75270 postal code paris france',
headers=headers).text
soup = BeautifulSoup(html, 'lxml')
location = soup.select_one('.XcVN5d').text
print(location)
Output:
48.8573° N, 2.3370° E
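As mentioned above, here is a sketch using the fake-useragent package (pip install fake-useragent), which picks a real browser User-Agent string for you; the rest mirrors the example above:
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
headers = {'User-Agent': UserAgent().random}  # a randomly chosen real browser UA
html = requests.get('https://www.google.com/search?q=latitude longitude of 75270 postal code paris france',
                    headers=headers).text
soup = BeautifulSoup(html, 'lxml')
print(soup.select_one('.XcVN5d').text)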
The question is very simple, but for me it doesn't work, and I don't know why!
I want to scrape the beer rating from this page https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone with BeautifulSoup, but it doesn't work.
This is my code:
import requests
import bs4
from bs4 import BeautifulSoup
url = 'https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone'
test_html = requests.get(url).text
soup = BeautifulSoup(test_html, "lxml")
rating = soup.findAll("span", class_="ratingValue")
rating
When I run it, it doesn't work, but if I do the same thing with another page, it works... I don't know why. Can someone help me? The rating result should be 4.58.
Thanks everybody!
If you print test_html, you'll find you got a 403 Forbidden response.
You should add headers (at least a User-Agent) to your GET request.
import requests
from bs4 import BeautifulSoup
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
}
url = 'https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone'
test_html = requests.get(url, headers=headers).text
soup = BeautifulSoup(test_html, 'html5lib')
rating = soup.find('span', {'itemprop': 'ratingValue'})
print(rating.text)
# 4.58
You are getting a forbidden status code (HTTP error 403), which means the server will not fulfill your request despite understanding it. You will definitely get this error if you try to scrape many of the more popular websites, which have security features to prevent bots. So you need to disguise your request!
For that you need to use headers.
You also need to correct the tag attribute whose data you're trying to get, i.e. itemprop.
Use lxml as your tree builder, or any other of your choice.
import requests
from bs4 import BeautifulSoup
url = 'https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone'
# Add this
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
test_html = requests.get(url, headers=headers).text
soup = BeautifulSoup(test_html, 'lxml')
rating = soup.find('span', {'itemprop':'ratingValue'})
print(rating.text)
The page you are requesting responds with 403 Forbidden, so you might not get an exception, but you will get a blank result like []. To avoid this situation we add a user agent, and this code will get you the desired result.
import urllib.request
from bs4 import BeautifulSoup
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
url = "https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone"
headers={'User-Agent':user_agent}
request=urllib.request.Request(url,None,headers) #The assembled request
response = urllib.request.urlopen(request)
soup = BeautifulSoup(response, "lxml")
rating = soup.find('span', {'itemprop':'ratingValue'})
rating.text
You are facing this error because some websites can't be scraped with requests and BeautifulSoup alone. For these kinds of websites you have to use Selenium:
Download the latest ChromeDriver from this link, according to your operating system.
Install Selenium with the command "pip install selenium".
# import required modules
from selenium import webdriver
from bs4 import BeautifulSoup
import time, os
current_dir = os.getcwd()
print(current_dir)
# concatenate the web driver path with your current dir; if you are using Windows, change "/" to "\"
# make sure you placed chromedriver in the current directory
driver = webdriver.Chrome(current_dir + '/chromedriver')
# driver.get opens the URL in the browser
driver.get('https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone')
time.sleep(1)
# fetch the rendered HTML from the driver
super_html = driver.page_source
# now parse the raw HTML with 'html.parser'
soup = BeautifulSoup(super_html, "html.parser")
rating = soup.findAll("span", itemprop="ratingValue")
rating[0].text
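If you are on a newer Selenium (4.x), the positional driver-path argument was removed. A minimal sketch of the same idea, assuming Selenium 4.6+ where Selenium Manager fetches chromedriver automatically:
from selenium import webdriver
from bs4 import BeautifulSoup
driver = webdriver.Chrome()  # Selenium Manager resolves the driver itself
driver.get('https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone')
soup = BeautifulSoup(driver.page_source, 'html.parser')
print(soup.find('span', itemprop='ratingValue').text)
driver.quit()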