Extracting link from soup python

Extracting link from soup python - python

I'm trying make an app gets the source links on bandcamp but im kinda stuck. Is there a way to get the source link with beautifulsoup.
The link im trying to get
Bandcamp

The data is within the <script> tags in json format. So use BeautifulSoup to get the 'script'. The data you are after is in the data-tralbum attribute.
Onece you get thet, have json read it in, then just iterate through the json structure:
from bs4 import BeautifulSoup
import requests
import json
url = 'https://vine.bandcamp.com/album/another-light'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
script = str(soup.find_all('script')[4]['data-tralbum'])
jsonData = json.loads(script)
trackinfo = jsonData['trackinfo']
links = []
for each in trackinfo:
links.append(each['file']['mp3-128'])
Output:
print(links)
['https://t4.bcbits.com/stream/efbba461835eff472bd04a2f9e9910a9/mp3-128/1761020287?p=0&ts=1638288735&t=8ae6343808036ab513cd5436ea009e5d0de784e4&token=1638288735_9139d56ec86f2d44b83a11f3eed8caf7075d6039', 'https://t4.bcbits.com/stream/3e5ef92e6d83e853958ed01955c95f5f/mp3-128/1256475880?p=0&ts=1638288735&t=745a6c701cf1c5772489da5467f6cae5d3622818&token=1638288735_7e86a32c635ba92e0b8320ef56a457d988286cff', 'https://t4.bcbits.com/stream/bbb49d4a72cb80feaf759ec7890abbb6/mp3-128/3439518541?p=0&ts=1638288735&t=dcc7ef7d1d7823e227339fb3243385089478ebe7&token=1638288735_5db36a29c58ea038828d7b34b67e13bd80597dd8', 'https://t4.bcbits.com/stream/8c8a69959337f6f4809f6491c2822b45/mp3-128/1330130896?p=0&ts=1638288735&t=d108dac84dfaac901a546c5fcf5064240cca376b&token=1638288735_8d9151aa82e7a00042025f924660dd3a093c2f74', 'https://t4.bcbits.com/stream/4d4253633405f204d7b1c101379a73be/mp3-128/2478242466?p=0&ts=1638288735&t=a8cd539d0ce8ff417f9b69740070870ed9a182a5&token=1638288735_ad8b5e93c8ffef6623615ce82a6754678fa67b67', 'https://t4.bcbits.com/stream/6c4feee38e289aea76080e9ddc997fa5/mp3-128/2243532902?p=0&ts=1638288735&t=83417c3aba0cef0f969f93bac5165e582f24a588&token=1638288735_c1d9d43b4e10cc6d02c822de90eda3a52c382df2', 'https://t4.bcbits.com/stream/a24dc5dad7b619d47b006e26084ff38f/mp3-128/3054008347?p=0&ts=1638288735&t=4563c326a272c9f5b8462fef1d082e46fac7f605&token=1638288735_55978e7edbe0410ff745913224b8740becad59d5', 'https://t4.bcbits.com/stream/6221790d7f55d3b1f006bd5fac5458fe/mp3-128/1500140939?p=0&ts=1638288735&t=9ecc210c53af05f4034ee00cd1a96a043312a4a7&token=1638288735_0f2faba41da8952f841669513d04bdaaae35a629', 'https://t4.bcbits.com/stream/030506909569626a0d2d7d182b61c691/mp3-128/1707615013?p=0&ts=1638288735&t=c8dcbb2c491789928f5cb6ef8b755df999cb58b8&token=1638288735_b278ba825129ae1b5588b47d5cda345ef2db4e58', 'https://t4.bcbits.com/stream/d1ae0cbc281fc81ddd91f3a3e3d80973/mp3-128/2808772965?p=0&ts=1638288735&t=1080ff51fc40bb5b7afb3a2460f3209cbda549e3&token=1638288735_c93249c847acba5cf23521fa745e05b426a5ba05', 'https://t4.bcbits.com/stream/1b9d50f8210bdc3cf4d2e33986f319ae/mp3-128/2751220220?p=0&ts=1638288735&t=9f24f06dfc5c8a06f24f28664438a6f1a75a038c&token=1638288735_f3a98a20b3c344dc5a37a602a41572d5fe8539c1', 'https://t4.bcbits.com/stream/203cd15629ba03e3249f850d5e1ac42e/mp3-128/4188265472?p=0&ts=1638288735&t=4b4bc2f2194c63a1d3b957e3dd6046bd764c272a&token=1638288735_53a70e7d83ce8c2800baeaf92a5c19db4e146e3f', 'https://t4.bcbits.com/stream/c63b5c9ca090b233e675974c7e7ee4b2/mp3-128/258670123?p=0&ts=1638288735&t=a81ae9dc33dea2b2660d13dbbec93dbcb06e6b63&token=1638288735_446d0ae442cbbadbceb342fe4f7b69d0fbab2928', 'https://t4.bcbits.com/stream/2e824d3c643658c8e9e24b548bc8cb0b/mp3-128/2332945345?p=0&ts=1638288735&t=5bdf0264b9ffe4616d920c55f5081744bf0822d4&token=1638288735_872191bb67a3438ef0fd1ce7e8a9e5ca09e6c37e']

Related

How to scrape video URL from Webpage using python?

I want to download videos from a website.
Here is my code.
Every time when i run this code, it returns blank file.
Here is live code: https://colab.research.google.com/drive/19NDLYHI2n9rG6KeBCiv9vKXdwb5JL9Nb?usp=sharing
from bs4 import BeautifulSoup
import requests
url = requests.get("https://www.mxtakatak.com/xt0.3a7ed6f84ded3c0f678638602b48bb1b840bea7edb3700d62cebcf7a400d4279/video/20000kCCF0")
page = url.content
soup = BeautifulSoup(page, "html.parser")
#print(soup.prettify())
result = soup.find_all('video', class_="video-player")
print(result)

using Regex
import requests
import re
response = requests.get("....../video/20000kCCF0")
videoId = '20000kCCF0'
videos = re.findall(r'https://[^"]+' + videoId + '[^"]+mp4', response.text)
print(videos)

You always get a blank return because soup.find_all() doesn't find anything.
Maybe you should check the url.content you receive by hand and then decide what to look for with find_all()
EDIT: After digging a bit around I found out how to get the content_url_orig:
from bs4 import BeautifulSoup
import requests
import json
url = requests.get("https://www.mxtakatak.com/xt0.3a7ed6f84ded3c0f678638602b48bb1b840bea7edb3700d62cebcf7a400d4279/video/20000kCCF0")
page = url.content
soup = BeautifulSoup(page, "html.parser")
result = str(soup.find_all('script')[1]) #looking for script tag inside the html-file
result = result.split('window._state = ')[1].split("</script>']")[0].split('\n')[0]
#separating the json from the whole script-string, digged around in the file to find out how to do it
result = json.loads(result)
#navigating in the json to get the video-url
entity = list(result['entities'].items())[0][1]
download_url = entity['content_url_orig']
print(download_url)
Funny sidenote: If I read the JSON correctly you can find all videos with download-URLs the creator uploaded :)

Why am I unable to web-scrape URL from a hyperlink in this website?

I tried to extract URL from a hyperlink in this web: https://riwayat-file-covid-19-dki-jakarta-jakartagis.hub.arcgis.com/
I used the following Python code:
import requests
from bs4 import BeautifulSoup
url = "https://riwayat-file-covid-19-dki-jakarta-jakartagis.hub.arcgis.com/"
req = requests.get(url, headers)
soup = BeautifulSoup(req.content, 'html.parser')
print(soup.prettify())
links = soup.find_all('a')
for link in links:
if "href" in link.attrs:
print(str(link.attrs['href'])+"\n")
The problem is this code does not return any URL.
I want to get all of this urls:

You are unable to parse it as the data is dynamically loaded. As you can see in the following image, the HTML data that is being written to the page doesn't actually exist when you download the HTML source code. The JavaScript later parses the window.__SITE variable and extracts the data from there:
However, we can replicate this in Python. After downloading the page:
import requests
url = "https://riwayat-file-covid-19-dki-jakarta-jakartagis.hub.arcgis.com/"
req = requests.get(url)
You can use re (regex) to extract the encoded page source:
import re
encoded_data = re.search("window\.__SITE=\"(.*)\"", req.text).groups()[0]
Afterwards, you can use urllib to URL-decode the text, and json to parse the JSON string data:
from urllib.parse import unquote
from json import loads
json_data = loads(unquote(encoded_data))
You can then parse the JSON tree to get to the HTML source data:
html_src = json_data["site"]["data"]["values"]["layout"]["sections"][1]["rows"][0]["cards"][0]["component"]["settings"]["markdown"]
At that point, you can use your own code to parse the HTML:
soup = BeautifulSoup(html_src, 'html.parser')
print(soup.prettify())
links = soup.find_all('a')
for link in links:
if "href" in link.attrs:
print(str(link.attrs['href'])+"\n")
If you put it all together, here's the final script:
import requests
import re
from urllib.parse import unquote
from json import loads
from bs4 import BeautifulSoup
# Download URL
url = "https://riwayat-file-covid-19-dki-jakarta-jakartagis.hub.arcgis.com/"
req = requests.get(url)
# Get encoded JSON from HTML source
encoded_data = re.search("window\.__SITE=\"(.*)\"", req.text).groups()[0]
# Decode and load as dictionary
json_data = loads(unquote(encoded_data))
# Get the HTML source code for the links
html_src = json_data["site"]["data"]["values"]["layout"]["sections"][1]["rows"][0]["cards"][0]["component"]["settings"]["markdown"]
# Parse it using BeautifulSoup
soup = BeautifulSoup(html_src, 'html.parser')
print(soup.prettify())
# Get links
links = soup.find_all('a')
# For each link...
for link in links:
if "href" in link.attrs:
print(str(link.attrs['href'])+"\n")

The links are generated dynamically by javascript code and the data can be found un the structure below.
<script id="site-injection">
window.__SITE="your data is here"
</script>
So you need to grab this script element and parse the value of window.__SITE

Webscraping output []

Hey i just wanted to test Python Webscraping and i have no Idea why this doesn't work.
As output i become [] and nothing else.
Has anyone an Idea? BEcause if i go to the Website and search for the element i find it.
from bs4 import BeautifulSoup
import requests
html_text = requests.get("https://osu.ppy.sh/users/20488254").text
soup = BeautifulSoup(html_text, "lxml")
job = soup.find("div", class_ = "profile-detail__col profile-detail__col--bottom-right")
print(job)

Player info is loaded dynamically with JS. So, you can't scrape dynamic content using plain bs4. Luckily, they provide user info in json format inside script tag. If you open page source and look for json-user you will see there is a tag:
<script id="json-user" type="application/json">
{"avatar_url":"https:\/\/a.ppy.sh\/20488254?1622470835.jpeg","country_code":"AT","default_group":"default","id":20488254,...
</script>
You can grab json inside that tag and get any information about player. Here is how it would look like:
import json
import requests
from bs4 import BeautifulSoup
html_text = requests.get("https://osu.ppy.sh/users/20488254").text
soup = BeautifulSoup(html_text, "lxml")
json_data = json.loads(soup.find('script', {'id':'json-user'}).string)
Now let's say you are looking for player's global rank. All you need to do is to find the correct keys to navigate you there:
player_rank = json_data['statistics']['global_rank']
# -> 199303

How to make beautiful soup grab only what is between a set of "[:" ":]" in a web page?

Good afternoon! How do I make Beautifulsoup grab only what is between multiple sets of "[:" and ":]" So far I have got the entire page in my soup, but it does not have tags, sadly.
What it looks like so far
I have tried a couple of things so far:
soup.findAll(text="[")
keys = soup.find("span", attrs = {"class": "objectBox objectBox-string"})
import bs4 as bs
import urllib.request
source = urllib.request.urlopen("https://login.microsoftonline.com/common/discovery/keys").read()
soup = bs.BeautifulSoup(source,'lxml')
# ---------------------------------------------
# prior script that I was playing with trying to tackle this issue
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
# Set URL to scrape new certs from
newcerts = "https://login.microsoftonline.com/common/discovery/keys"
# Connect to the URL
response = requests.get(newcerts)
# Parse HTML and save to BeautifulSoup Object
soup = BeautifulSoup(response.text, "html.parser")
keys = soup.find("span", attrs = {"class": "objectBox objectBox-string"})
End goal is to retrieve the public PKI keys from Azure's website at https://login.microsoftonline.com/common/discovery/keys

Not sure if this is what you meant to grab. Try the script below:
import json
import requests
url = 'https://login.microsoftonline.com/common/discovery/keys'
res = requests.get(url)
jsonobject = json.loads(res.content)
for item in jsonobject['keys']:
print(item['x5c'])

Scraping JSON data from e-commerce Ajax site with Python

Previously, I posted a question on how do I get the data from an AJAX website which is from this link: Scraping AJAX e-commerce site using python
I understand a bit on how to get the response which is using the chrome F12 in Network tab and do some coding with python to display the data. But I barely can't find the specific API url for it. The JSON data is not coming from a URL like the previous website, but it is in the Inspect Element in Chrome F12.
My real question actually is how do I get ONLY the JSON data using BeautifulSoup or anything related to it? After I can get only the JSON data from the application/id+json then I will convert it to be a JSON data that python can recognize so that I can display the products into table form.
One more problem is after several time I run the code, the JSON data is missing. I think the website will block my IP address. How to I solve this problem?
Here is the website link:
https://www.lazada.com.my/catalog/?_keyori=ss&from=input&page=1&q=h370m&sort=priceasc
Here is my code
from bs4 import BeautifulSoup import requests
page_link =
'https://www.lazada.com.my/catalog/?_keyori=ss&from=input&page=1&q=h370m&sort=priceasc'
page_response = requests.get(page_link, timeout=5)
page_content = BeautifulSoup(page_response.content, "html.parser")
print(page_content)

You can just use the find method with the pointer to your <script> tag with the attr type=application/json
Then you can use the json package to load the value inside a dict
Here is a code sample:
from bs4 import BeautifulSoup as soup
import requests
import json
page_link = 'https://www.lazada.com.my/catalog/?_keyori=ss&from=input&page=1&q=h370m&sort=priceasc'
page_response = requests.get(page_link, timeout=5)
page_content = soup(page_response.text, "html.parser")
json_tag = page_content.find('script',{'type':'application/json'})
json_text = json_tag.get_text()
json_dict = json.loads(json_text)
print(json_dict)
EDIT: My bad, I didn't see you search type=application/ld+json attr
As it seems to have several <script>with this attr, you can simply use the find_all method:
from bs4 import BeautifulSoup as soup
import requests
import json
page_link = 'https://www.lazada.com.my/catalog/?_keyori=ss&from=input&page=1&q=h370m&sort=priceasc'
page_response = requests.get(page_link, timeout=5)
page_content = soup(page_response.text, "html.parser")
json_tags = page_content.find_all('script',{'type':'application/ld+json'})
for jtag in json_tags:
json_text = jtag.get_text()
json_dict = json.loads(json_text)
print(json_dict)

You will have to parse data from HTML manually from your Soup as other websites will restrict their json API from other parties.
You can find out more details here in the documentation:
https://www.crummy.com/software/BeautifulSoup/bs4/doc/

Try:
import requests
response = requests.get(url)
data = response.json()

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Extracting link from soup python - python

I'm trying make an app gets the source links on bandcamp but im kinda stuck. Is there a way to get the source link with beautifulsoup. The link im trying to get Bandcamp

Related

How to scrape video URL from Webpage using python?

Why am I unable to web-scrape URL from a hyperlink in this website?

Webscraping output []

How to make beautiful soup grab only what is between a set of "[:" ":]" in a web page?

Scraping JSON data from e-commerce Ajax site with Python

Categories

Resources