So I'm trying to retrieve Bitcoin prices from CoinMarketCap.com.
I'm using Python along with requests and bs4.
import requests
from bs4 import BeautifulSoup
link = "https://coinmarketcap.com/currencies/bitcoin/"
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0"
}
data = requests.get(link, headers=header)
soup = BeautifulSoup(data.content, 'html.parser')
bitcoinPrice = soup.find(id="quote_price")
print(bitcoinPrice)
When I run the script, I get the following result, which includes extra markup I don't want. I just want the Bitcoin price.
<span data-currency-price="" data-usd="9806.68980398" id="quote_price">
<span class="h2 text-semi-bold details-panel-item--price__value" data-currency-value="">9806.69</span>
<span class="text-large" data-currency-code="">USD</span>
</span>
How do I extract the Bitcoin price from that chunk of data?
I believe this should give you what you want:
bitcoinPrice.span.contents[0]
which contains something like
'9808.16'
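For completeness, here is a minimal sketch (reusing the soup object from the question) that pulls the number out and converts it to a float; it assumes the markup shown above hasn't changed:
# minimal sketch, assuming the markup shown in the question
price_tag = soup.find(id="quote_price")
if price_tag is not None and price_tag.span is not None:
    price = float(price_tag.span.contents[0])  # e.g. 9806.69
    print(price)
else:
    print("Price element not found; the page layout may have changed.")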
Alternatively, you can target the inner span directly:
bitcoinPrice = soup.find("span", class_="details-panel-item--price__value").text
This is another way, using a CSS selector:
print(soup.select_one('.details-panel-item--price__value').text)
You can use the official API under the basic (free) plan; just add your API key into the code below. The example is adapted from the official documentation.
from requests import Request, Session
from requests.exceptions import ConnectionError, Timeout, TooManyRedirects
import json

url = 'https://pro-api.coinmarketcap.com/v1/cryptocurrency/quotes/latest'
parameters = {
    'id': '1'  # 1 is CoinMarketCap's id for Bitcoin
}
headers = {
    'Accepts': 'application/json',
    'X-CMC_PRO_API_KEY': 'api_key',  # replace with your own API key
}

session = Session()
session.headers.update(headers)

try:
    response = session.get(url, params=parameters)
    data = json.loads(response.text)
    # print(data)
    print(data['data']['1']['quote']['USD']['price'])
except (ConnectionError, Timeout, TooManyRedirects) as e:
    print(e)
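If you'd rather look Bitcoin up by its symbol instead of its numeric id, the same endpoint also accepts a symbol parameter (to the best of my knowledge; check the API docs for your plan), and the response is then keyed by the symbol. A sketch, reusing the session and url from above:
# sketch: look the coin up by symbol instead of its numeric id
# (the `symbol` parameter and the response keying are assumptions - verify against the docs)
parameters = {'symbol': 'BTC'}
response = session.get(url, params=parameters)
data = json.loads(response.text)
print(data['data']['BTC']['quote']['USD']['price'])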
So here's my script:
import requests
import urllib
import json
url = 'https://www.homedepot.com/p/ZLINE-Kitchen-and-Bath-36-DuraSnow-Stainless-Steel-Range-Hood-with-Hand-Hammered-Copper-Shell-8654HH-36-8654HH-36/311287560'
response = json.loads(requests.get(url).text)
print(response["offers"])
After grabbing the page source of https://www.homedepot.com/p/ZLINE-Kitchen-and-Bath-36-DuraSnow-Stainless-Steel-Range-Hood-with-Hand-Hammered-Copper-Shell-8654HH-36-8654HH-36/311287560,
I want to extract this data:
"offers":{"#type":"Offer","url":"https://www.homedepot.com/p/ZLINE-Kitchen-and-Bath-36-DuraSnow-Stainless-Steel-Range-Hood-with-Hand-Hammered-Copper-Shell-8654HH-36-8654HH-36/311287560","priceCurrency":"USD","price":1449.95,"priceValidUntil":"4/7/2021","availability":"https://schema.org/InStock"}
More specifically, price and priceValidUntil.
From some googling, I think this would be the way to do it, but since there's so much data within the webpage, the script takes a very long time to run.
Is there a more efficient way of getting this JSON data, and am I grabbing it correctly?
You can use this example to load the JSON data from the HTML page:
import json
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:87.0) Gecko/20100101 Firefox/87.0"
}

url = "https://www.homedepot.com/p/ZLINE-Kitchen-and-Bath-36-DuraSnow-Stainless-Steel-Range-Hood-with-Hand-Hammered-Copper-Shell-8654HH-36-8654HH-36/311287560"
soup = BeautifulSoup(requests.get(url, headers=headers).content, "html.parser")

# the structured product data lives in a <script type="application/ld+json"> tag
data = json.loads(
    soup.select_one('script[type="application/ld+json"]').contents[0]
)

# uncomment this to print all data:
# print(json.dumps(data, indent=4))

print("Price:", data["offers"]["price"])
print("Price valid until:", data["offers"]["priceValidUntil"])
Prints:
Price: 1449.95
Price valid until: 4/8/2021
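Note that some pages embed more than one application/ld+json block. If select_one happens to grab the wrong one, a slightly more defensive sketch (same idea, just iterating over all of the blocks and keeping the one that contains "offers") could look like this:
# sketch: check every ld+json block and keep the one that has "offers"
offers = None
for script in soup.select('script[type="application/ld+json"]'):
    try:
        block = json.loads(script.contents[0])
    except (json.JSONDecodeError, IndexError):
        continue
    if isinstance(block, dict) and "offers" in block:
        offers = block["offers"]
        break

if offers:
    print("Price:", offers["price"])
    print("Price valid until:", offers["priceValidUntil"])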
I'm trying to scrape the product links under ZIP code 08041. I have written code to scrape the products without a ZIP code, but I don't know how to build and send the request for the products under 08041.
Here is my code:
import requests
import random
import time
from bs4 import BeautifulSoup
import wget
import csv
from fp.fp import FreeProxy

def helloworld(url):
    r = requests.get(url)
    print('Status', r.status_code)
    #time.sleep(8)
    soup = BeautifulSoup(r.content, 'html.parser')
    post = soup.find_all('a', "name")
    for href in post:
        if href.get('href')[1] == 'p':
            href = href.get('href')
            print(href)

def page_counter():
    url1 = "https://soysuper.com/c/aperitivos#products"
    print(url1, '\n')
    helloworld(url1)

page_counter()
You can use the back-end endpoints to mimic a request with a given ZIP code.
Note: The cookie is hard-coded but valid for a year.
Here's how:
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.105 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
    "Cookie": "soysuper=eyJjYXJ0IjoiNjA2NWNkMzg5ZDI5YzkwNDU1NjI3MzYzIiwiZXhwaXJlcyI6MTY0ODg0MTMzOSwib3JpZCI6IkM2NzgwOUYyLTkyRUYtMTFFQi04NjNELTgzMTBCMUUwMTM2NiIsInNtIjoiIiwidXVpZCI6IkIwQjYxQzRFLTkyRUYtMTFFQi05MjRCLTA5MTFCMUUwMTM2NiIsIndoIjpbIjU0MDQ5MjEwMDk1Y2ZhNTQ2YzAwMDAwMCIsIjRmZjMwZTZhNTgzMmU0OGIwMjAwMDAwMCIsIjU5Y2JhZmE2OWRkNGU0M2JmMzIwODM0MiIsIjRmMzEyNzU4ZTNjNmIzMDAzMjAwMDAwMCIsIjVhMTZmNjdhMjUwOGMxNGFiMzE0OTY4MyIsIjYwMjQxNTEzNzIyZDZhNTZkNDZlMjhmNyIsIjRmZjMwZTJkYzI3ZTk1NTkwMjAwMDAwMSIsIjU5ZjcxYTZlNjI4YWIwN2UyYjJjZmJhMSIsIjU5Y2JhZjNjOWRkNGU0M2JmMzIwODM0MSIsIjVhMGU0NDFhNTNjOTdiM2UxNDYyOGEzNiIsIjRmMmJiZmI3ZWJjYjU1OGM3YjAwMDAwMCIsIjYwNDExZjJlNzIyZDZhMTEyZDVjYTNlYiIsIjViMWZmZjAyNzI1YTYxNzBjOTIxMjc0MSIsIjVlNzk2NWUwZDc5MTg3MGU0NTA1MGMwMCIsIjVkMTI0NDQ2OWRkNGU0NGFkMDU3MmMxMSJdLCJ6aXAiOiIwODA0MSJ9--166849121eece159a6fdb0c0fe8341032321d9b1;"
}

with requests.Session() as connection:
    r = connection.get("https://soysuper.com/supermarket?zipcode=08041", headers=headers)
    headers["Request-Id"] = r.headers["Next-Request-Id"]
    headers["Referer"] = "https://soysuper.com/c/aperitivos"
    products_data = connection.get("https://soysuper.com/c/aperitivos?products=1&page=1", headers=headers).json()
    print(products_data["products"]["total"])
Output: Total number of products for 08041 zip code.
2923
What you're effectively getting is the JSON with all the product data for a given page (this is the same response you can see in the browser's Network tab).
Do notice the pager key. Use it to "paginate" the API and get more product info.
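I haven't mapped out the pager structure in detail, but since the endpoint already takes a page query parameter, a rough sketch for walking the pages could look like the following. Run it inside the same with requests.Session() as connection: block as above; the field names and the empty-page stop condition are assumptions, so adjust them to what the pager key actually reports.
# rough sketch: walk the category pages
# (the "items" field and the empty-page stop condition are assumptions;
#  check the "pager" key in the real response and adapt)
page = 1
all_products = []
while True:
    url = f"https://soysuper.com/c/aperitivos?products=1&page={page}"
    payload = connection.get(url, headers=headers).json()
    items = payload.get("products", {}).get("items", [])
    if not items:
        break
    all_products.extend(items)
    page += 1

print(len(all_products))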
I'm trying to write a program that downloads the most upvoted picture from a subreddit, but for some reason BeautifulSoup does not find all the links on the website. I know I could try other methods, but I'm curious why it isn't finding all the links every time.
Here is the code as well.
from PIL import Image
import requests
from bs4 import BeautifulSoup

url = 'https://www.reddit.com/r/wallpaper/top/'
result = requests.get(url)
soup = BeautifulSoup(result.text, 'html.parser')

for link in soup.find_all('a'):
    print(link.get('href'))
The site is loaded with JavaScript, and bs4 cannot render JavaScript. Therefore, I've located the data within a script tag.
import requests
import re
import json

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
}

def main(url):
    r = requests.get(url, headers=headers)
    # the page state is embedded in a <script> tag as `window.___r = {...}`
    match = re.search(r"window.___r = ({.+})", r.text).group(1)
    data = json.loads(match)
    # print(data.keys())
    # humanreadable = json.dumps(data, indent=4)

main("https://www.reddit.com/r/wallpaper/top/")
Shorter version:
match = re.finditer(r'permalink":"(.+?)"', r.text)
for item in match:
    print(item.group(1))
Output:
https://www.reddit.com/r/wallpaper/comments/fv9ubr/khyber_pakhtunkhwa_pakistan_balakot_1920x1024/
https://www.reddit.com/user/wsopgame/comments/fvbxom/join_the_official_wsop_online_poker_game_and/
https://www.reddit.com/user/wsopgame/comments/fvbxom/join_the_official_wsop_online_poker_game_and/?instanceId=t3_p%3DgAAAAABeiiTtw4FM0zBerf9DDiq5tmonjJbAwzQb_UwA-VHlw2J8zUxw-y6Doa6j-jPP0qt05lRZfyReQwnLH9pN6wdSBBvqhgxgRS3uKyKCRvkk6WNwns5wpad0ijMgHwqVnZSGMT0KWP4WB15zBNkb3j96ifm23pT4uACb6cpNVh-TE05GiTtDnD9UUMir02Z7hOr0x4f_wLJEIplafXRp2yiAFPh5VzH_4VSsPx9zV7v3IJwN5ctYLfIcdCW5Z3W-z3bbOVUCU2HqqRAoh0XEj0LrgdicMexa9fzPbtWOshfx3kIazwFhYXoSowPBRZUquSs9zEaQwP1B-wg951edNb7RSjYTrDpQ75zsMfIkasKvAOH-V58%3D
https://www.reddit.com/r/wallpaper/comments/fv6wew/lone_road_in_nowhere_arizona_1920x1080/
https://www.reddit.com/r/wallpaper/comments/fvaqaa/the_hobbit_house_1920_x_1080/
https://www.reddit.com/r/wallpaper/comments/fvcs4j/something_i_made_in_illustrator_5120_2880/
https://www.reddit.com/r/wallpaper/comments/fv09u2/bath_time_in_rocky_mountain_national_park_1280x720/
https://www.reddit.com/r/wallpaper/comments/fuyomz/up_is_still_my_favorite_film_grandpa_carl_cams/
https://www.reddit.com/r/wallpaper/comments/fvagex/beautiful_and_colorful_nature_wallpaper_1920x1080/
https://www.reddit.com/r/wallpaper/comments/fv3nnn/maroon_bells_co_photo_credit_to/
https://www.reddit.com/r/wallpaper/comments/fuyg0z/volcano_lightening_19201080/
https://www.reddit.com/r/wallpaper/comments/fvgohk/doctor_strange1920x1080/
https://www.reddit.com/user/redditads/comments/ezogdp/reach_your_audience_on_reddit/
https://www.reddit.com/user/redditads/comments/ezogdp/reach_your_audience_on_reddit/?instanceId=t3_p%3DgAAAAABeiiTt9isPY03zwoimtzcC7w3uLzUDCuoD5cU6ekeEYt48cRAqoMsc1ZDBJ6OeK1U3Bs2Zo1ZSWzdQ4DOux21vGvWzJkxNWQ14XzDWag_GlrE-t_4rpFA_73kW94xGUQchsXL7f4VkbbHIyn8SMlUlTtt3j3lJCViwINOQgIF3p5N8Q4ri-swtJC-JyEUYa4dJazlZ9xLYyOHSvMkiR3k9lDx0NEKqpqfbQ9__f3xLUzgS4yF4OngMDFUVFa5nyH3I32mkP3KezXLxOR6H8CSGI_jqRA4dBV-AnHLuzPlgENRpfaMhWJ04vTEOjmG4sm4xs65OZCumqNstzlDEvR7ryFwL6LeH02a9E3czck5jfKY7HXQ%3D
https://www.reddit.com/r/wallpaper/comments/fuzjza/ghost_cloud_1280x720/
https://www.reddit.com/r/wallpaper/comments/fvg88o/park_autumn_tress_wallpaper_1920x1080/
https://www.reddit.com/r/wallpaper/comments/fv47r8/audi_quattro_s1_3840x2160_fh4/
https://www.reddit.com/r/wallpaper/comments/fuybjs/spacecrafts_1920_x_1080/
https://www.reddit.com/r/wallpaper/comments/fv043i/dragonfly_1280x720/
https://www.reddit.com/r/wallpaper/comments/fv06ud/muskrat_swim_1280x720/
https://www.reddit.com/r/wallpaper/comments/fvdafk/natural_beauty_1920x1080/
https://www.reddit.com/r/wallpaper/comments/fvbnuc/cigar_man_19201080/
https://www.reddit.com/r/wallpaper/comments/fvcww4/thunder_road_3840_x_2160/
https://www.reddit.com/user/redditads/comments/7w17su/interested_in_gaining_a_new_perspective_on_things/
https://www.reddit.com/user/redditads/comments/7w17su/interested_in_gaining_a_new_perspective_on_things/?instanceId=t3_p%3DgAAAAABeiiTtxVzGp9KwvtRNa1pOVCgz2IBkTGRxqdyXk4WTsjAkWS9wzyDVF_1aSOz36HqHOVrngfj3z_9O1cAkzz-0fwhxyJ_8jePT3F88mrveLChf_YRIbAtxb-Ln_OaeeXUnyrFVl-OPN7cqXvtgh3LoymBx3doL-bEVnECOWkcSXvUIwpMn-flVZ5uNcGL1nKEiszUcORqq1oQ32BnrmWHomrDb3Q%3D%3D
https://www.reddit.com/r/wallpaper/comments/fv3xqs/social_distancing_log_1920x1080/
https://www.reddit.com/r/wallpaper/comments/fvbcpl/neon_city_wallpaper_19201080/
https://www.reddit.com/r/wallpaper/comments/fvbhdb/sunrise_wallpaper_19201080/
https://www.reddit.com/r/wallpaper/comments/fv2eno/second_heavy_bike_in_ghost_recon_breakpoint/
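As the output shows, the regex also picks up promoted/user posts and duplicate tracking URLs. A small filter on top of the shorter version keeps only the unique /r/wallpaper submissions:
# keep only unique /r/wallpaper submissions, dropping promoted posts
# and the ?instanceId=... tracking duplicates
seen = set()
for item in re.finditer(r'permalink":"(.+?)"', r.text):
    link = item.group(1).split("?")[0]
    if "/r/wallpaper/comments/" in link and link not in seen:
        seen.add(link)
        print(link)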
I want to scrape emails from the results of a search query, but when I access the class with a CSS selector and print it, it always shows an empty list. How can I access the .r class or class="g"?
import requests
from bs4 import BeautifulSoup
url = "https://www.google.com/search?sxsrf=ACYBGNQA4leQETe0psVZPu7daLWbdsc9Ow%3A1579194494737&ei=fpggXpvRLMakwQKkqpSICg&q=%22computer+science+%22%22usa%22+%22%40yahoo.com%22&oq=%22computer+science+%22%22usa%22+%22%40yahoo.com%22&gs_l=psy-ab.12...0.0..7407...0.0..0.0.0.......0......gws-wiz.82okhpdJLYg&ved=0ahUKEwibiI_3zYjnAhVGUlAKHSQVBaEQ4dUDCAs"
responce = requests.get(url)
soup = BeautifulSoup(responce.text, "html.parser")
test = soup.select('.r')
print(test)
Your program is correct, but to get a correct answer from Google, you need to specify a User-Agent header:
import requests
from bs4 import BeautifulSoup
url = "https://www.google.com/search?sxsrf=ACYBGNQA4leQETe0psVZPu7daLWbdsc9Ow%3A1579194494737&ei=fpggXpvRLMakwQKkqpSICg&q=%22computer+science+%22%22usa%22+%22%40yahoo.com%22&oq=%22computer+science+%22%22usa%22+%22%40yahoo.com%22&gs_l=psy-ab.12...0.0..7407...0.0..0.0.0.......0......gws-wiz.82okhpdJLYg&ved=0ahUKEwibiI_3zYjnAhVGUlAKHSQVBaEQ4dUDCAs"
headers = {'User-Agent':'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0'}
responce = requests.get(url, headers=headers) # <-- specify custom header
soup = BeautifulSoup(responce.text, "html.parser")
test = soup.select('.r')
print(test)
Prints:
[<div class="r"><a href="https://www.yahoo.com/news/11-course-complete-computer-science-171322233.html" onmousedown="return rwt(this,'','','','1','AOvVaw2wM4TUxc_4V7s9GjeWTNAG','','2ahUKEwjt17Kk-YjnAhW2R0EAHcnsC3QQFjAAegQIAxAB','','',event)"><div class="TbwUpd"><img alt="https://...
...
To get the emails out of the Google Search results, you need to use a regex:
# this regex may need modifications
re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', variable_where_to_search_from)
Code:
from bs4 import BeautifulSoup
import requests, lxml, re

headers = {
    "User-agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

html = requests.get('https://www.google.com/search?q="computer science ""usa" "@yahoo.com"', headers=headers)
soup = BeautifulSoup(html.text, 'lxml')

for result in soup.select('.tF2Cxc'):
    try:
        snippet = result.select_one('.lyLwlc').text
    except:
        snippet = None

    match_email = re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', str(snippet))
    email = '\n'.join(match_email).strip()
    print(email)
----------
'''
ahmed_733@yahoo.com
yjzou@uguam.uog
yzou2002@yahoo.com
...
Alternatively, you can do the same thing using the Google Organic Results API from SerpApi. It's a paid API with a free plan.
It doesn't extract emails using regex, although that would be a great possible feature. The main difference is that it's much easier and faster to get things done rather than building everything from scratch.
Code to integrate:
from serpapi import GoogleSearch
import re

params = {
    "api_key": "YOUR_API_KEY",
    "engine": "google",
    "q": '"computer science ""usa" "@yahoo.com"',
}

search = GoogleSearch(params)
results = search.get_dict()

for result in results['organic_results']:
    try:
        snippet = result['snippet']
    except:
        snippet = None

    match_email = re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', str(snippet))
    email = '\n'.join(match_email).strip()
    print(email)
---------
'''
shaikotweb@yahoo.com
ahmed_733@yahoo.com
RPeterson@L1id.com
rj_peterson@yahoo.com
'''
Disclaimer: I work for SerpApi.
So I'm trying to navigate to this url: https://www.instacart.com/store/wegmans/search_v3/horizon%201%25
and scrape data from the div with the class item-name item-row. There are two main problems, though: the first is that instacart.com requires a login before you can get to that URL, and the second is that most of the page is generated with JavaScript.
I believe I've solved the first problem because my session.post(...) gets a 200 response code. I'm also pretty sure that r.html.render() is supposed to solve the second problem by rendering the JavaScript-generated HTML before I scrape it. Unfortunately, the last line in my code only returns an empty list, despite the fact that Selenium had no problem getting this element. Does anyone know why this isn't working?
from requests_html import HTMLSession
from bs4 import BeautifulSoup

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}

session = HTMLSession()
res1 = session.get('http://www.instacart.com', headers=headers)
soup = BeautifulSoup(res1.content, 'html.parser')
token = soup.find('meta', {'name': 'csrf-token'}).get('content')

data = {"user": {"email": "alexanderjbusch@gmail.com", "password": "password"},
        "authenticity_token": token}
response = session.post('https://www.instacart.com/accounts/login', headers=headers, data=data)
print(response)

r = session.get("https://www.instacart.com/store/wegmans/search_v3/horizon%201%25", headers=headers)
r.html.render()
print(r.html.xpath("//div[@class='item-name item-row']"))
After logging in using the requests module and BeautifulSoup, you can make use of the link I've already suggested in the comment to parse the required data, which is available as JSON. The following script should get you the name, quantity, price, and a link to the product concerned. You can only get 21 products using the script below. There is an option for pagination within this JSON content; you can get all of the products by playing around with that pagination (see the sketch after the output).
import json
import requests
from bs4 import BeautifulSoup

baseurl = 'https://www.instacart.com/store/'
data_url = "https://www.instacart.com/v3/retailers/159/module_data/dynamic_item_lists/cart_starters/storefront_canonical?origin_source_type=store_root_department&tracking.page_view_id=b974d56d-eaa4-4ce2-9474-ada4723fc7dc&source=web&cache_key=df535d-6863-f-1cd&per=30"

data = {"user": {"email": "alexanderjbusch@gmail.com", "password": "password"},
        "authenticity_token": ""}

headers = {
    'user-agent': 'Mozilla/5.0',
    'x-requested-with': 'XMLHttpRequest'
}

with requests.Session() as s:
    res = s.get('https://www.instacart.com/', headers={'user-agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(res.text, 'lxml')
    token = soup.select_one("[name='csrf-token']").get('content')
    data["authenticity_token"] = token
    s.post("https://www.instacart.com/accounts/login", json=data, headers=headers)
    resp = s.get(data_url, headers=headers)

    for item in resp.json()['module_data']['items']:
        name = item['name']
        quantity = item['size']
        price = item['pricing']['price']
        product_page = baseurl + item['click_action']['data']['container']['path']
        print(f'{name}\n{quantity}\n{price}\n{product_page}\n')
Partial output:
SB Whole Milk
1 gal
$3.90
https://www.instacart.com/store/items/item_147511418
Banana
At $0.69/lb
$0.26
https://www.instacart.com/store/items/item_147559922
Yellow Onion
At $1.14/lb
$0.82
https://www.instacart.com/store/items/item_147560764
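As for the pagination mentioned above, I haven't verified the exact parameter names, so treat this as a rough sketch: it assumes the module_data endpoint honours a page-style parameter alongside per=30, and it should run inside the same requests.Session() block as the script above. Inspect the requests your browser actually makes in the Network tab to confirm the real parameter.
# rough sketch: page through the endpoint (the "page" parameter and the
# empty-items stop condition are assumptions - confirm in the Network tab)
page = 1
while True:
    resp = s.get(f"{data_url}&page={page}", headers=headers)
    items = resp.json().get('module_data', {}).get('items', [])
    if not items:
        break
    for item in items:
        print(item['name'])
    page += 1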