I am trying to get the first result from the youtube search query by scraping the page and open the page in the browser! This code does not work! And I am having problems finding the right class, id or selector to get the link.
I have tried to use tags but some them return none.
import webbrowser
import urllib.request
import bs4 as bs
iarg = str(input('Query: '))
url = str('https://www.youtube.com/results?search_query={}'.format(arg))
req = urllib.request.Request(url, data=None, headers={'User-Agent':'Mozilla'})
src = urllib.request.urlopen(req)
soup = bs.BeautifulSoup(src, 'html.parser')
results = []
for elem in a = soup.find_all("a"):
link = elem.get(href)
results.append(link)
webbrowser.open(results[0])
Thanks in advance!
Related
I want to download videos from a website.
Here is my code.
Every time when i run this code, it returns blank file.
Here is live code: https://colab.research.google.com/drive/19NDLYHI2n9rG6KeBCiv9vKXdwb5JL9Nb?usp=sharing
from bs4 import BeautifulSoup
import requests
url = requests.get("https://www.mxtakatak.com/xt0.3a7ed6f84ded3c0f678638602b48bb1b840bea7edb3700d62cebcf7a400d4279/video/20000kCCF0")
page = url.content
soup = BeautifulSoup(page, "html.parser")
#print(soup.prettify())
result = soup.find_all('video', class_="video-player")
print(result)
using Regex
import requests
import re
response = requests.get("....../video/20000kCCF0")
videoId = '20000kCCF0'
videos = re.findall(r'https://[^"]+' + videoId + '[^"]+mp4', response.text)
print(videos)
You always get a blank return because soup.find_all() doesn't find anything.
Maybe you should check the url.content you receive by hand and then decide what to look for with find_all()
EDIT: After digging a bit around I found out how to get the content_url_orig:
from bs4 import BeautifulSoup
import requests
import json
url = requests.get("https://www.mxtakatak.com/xt0.3a7ed6f84ded3c0f678638602b48bb1b840bea7edb3700d62cebcf7a400d4279/video/20000kCCF0")
page = url.content
soup = BeautifulSoup(page, "html.parser")
result = str(soup.find_all('script')[1]) #looking for script tag inside the html-file
result = result.split('window._state = ')[1].split("</script>']")[0].split('\n')[0]
#separating the json from the whole script-string, digged around in the file to find out how to do it
result = json.loads(result)
#navigating in the json to get the video-url
entity = list(result['entities'].items())[0][1]
download_url = entity['content_url_orig']
print(download_url)
Funny sidenote: If I read the JSON correctly you can find all videos with download-URLs the creator uploaded :)
I would like to scrape this page with Python: https://statusinvest.com.br/acoes/proventos/ibovespa.
With this code:
import requests
from bs4 import BeautifulSoup as bs
URL = "https://statusinvest.com.br/acoes/proventos/ibovespa"
page = 1
req = requests.get(URL+str(page))
soup = bs(req.text, 'html.parser')
container = soup.find('div', attrs={'class','list'})
dividends = container.find('a')
for dividend in dividends:
links = dividend.find_all('a')
print(links)
But it doesn't return anything.
Can someone help me please?
Edited: you can see the below updated code to access any data you mentioned in the comment, you can modify according to your needs as all the data on that page is inside data variable.
Updated Code:
import json
import requests
from bs4 import BeautifulSoup as bs
url = "https://statusinvest.com.br"
links = []
req = requests.get(f"{url}/acoes/proventos/ibovespa")
soup = bs(req.content, 'html.parser')
data = json.loads(soup.find('input', attrs={'id': 'result'})["value"])
print("Date Com Data")
for datecom in data["dateCom"]:
print(f"{datecom['code']}\t{datecom['companyName']}\t{datecom['companyNameClean']}\t{datecom['companyId']}\t{datecom['companyId']}\t{datecom['resultAbsoluteValue']}\t{datecom['dateCom']}\t{datecom['paymentDividend']}\t{datecom['earningType']}\t{datecom['dy']}\t{datecom['recentEvents']}\t{datecom['recentEvents']}\t{datecom['uRLClear']}")
print("\nDate Payment Data")
for datePayment in data["datePayment"]:
print(f"{datePayment['code']}\t{datePayment['companyName']}\t{datePayment['companyNameClean']}\t{datePayment['companyId']}\t{datePayment['companyId']}\t{datePayment['resultAbsoluteValue']}\t{datePayment['dateCom']}\t{datePayment['paymentDividend']}\t{datePayment['earningType']}\t{datePayment['dy']}\t{datePayment['recentEvents']}\t{datePayment['recentEvents']}\t{datePayment['uRLClear']}")
print("\nProvisioned Data")
for provisioned in data["provisioned"]:
print(f"{provisioned['code']}\t{provisioned['companyName']}\t{provisioned['companyNameClean']}\t{provisioned['companyId']}\t{provisioned['companyId']}\t{provisioned['resultAbsoluteValue']}\t{provisioned['dateCom']}\t{provisioned['paymentDividend']}\t{provisioned['earningType']}\t{provisioned['dy']}\t{provisioned['recentEvents']}\t{provisioned['recentEvents']}\t{provisioned['uRLClear']}")
Seeing to the source code of that website one could fetch the json directly and get your desired links follow the below code.
Code:
import json
import requests
from bs4 import BeautifulSoup as bs
url = "https://statusinvest.com.br"
links=[]
req = requests.get(f"{url}/acoes/proventos/ibovespa")
soup = bs(req.content, 'html.parser')
data = json.loads(soup.find('input', attrs={'id': 'result'})["value"])
for datecom in data["dateCom"]:
links.append(f"{url}{datecom['uRLClear']}")
for datePayment in data["datePayment"]:
links.append(f"{url}{datePayment['uRLClear']}")
for provisioned in data["provisioned"]:
links.append(f"{url}{provisioned['uRLClear']}")
print(links)
Output:
Let me know if you have any questions :)
I am trying to find a downloadable video links in a website. For example, I am working with urls like these https://www.loc.gov/item/2015669100/. You can see that there is a m3u8 video link under mejs__mediaelement div tag.
However my code is not printing anything. Meaning that it's not finding the Video urls for the website.
My code is below
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
with open('pages2crawl.txt', 'r') as inFile:
lines = [line.rstrip() for line in inFile]
for page in lines:
req = Request(page, headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(urlopen(req).read(), 'html.parser')
pages = soup.findAll('div', attrs={'class' : 'mejs__mediaelement'})
for e in pages:
video = e.find("video").get("src")
if video.endswith("m3u8"):
print(video)
If you just want to make a simple script it would probably be easier to use regex.
import re, requests
s = requests.Session() #start the session
data = s.get(url) #http get request to download data
data = data.text #get the raw text
vidlinks = re.findall("src='(.*?).m3u8'/>", data) #find all between the two parts in the data
print(vidlinks[0] + ".m3u8") #print the full link with extension
You can use CSS selector source[type="application/x-mpegURL"] to extract MPEG link (or source[type="video/mp4"] to extract mp4 link):
import requests
from bs4 import BeautifulSoup
url = "https://www.loc.gov/item/2015669100/"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
link_mpeg = soup.select_one('source[type="application/x-mpegURL"]')["src"]
link_mp4 = soup.select_one('source[type="video/mp4"]')["src"]
print(link_mpeg)
print(link_mp4)
Prints:
https://tile.loc.gov/streaming-services/iiif/service:afc:afc2010039:afc2010039_crhp0001:afc2010039_crhp0001_mv04/full/full/0/full/default.m3u8
https://tile.loc.gov/storage-services/service/afc/afc2010039/afc2010039_crhp0001/afc2010039_crhp0001_mv04.mp4
I have the following code:
from bs4 import BeautifulSoup
import requests
import csv
url = "https://coingecko.com/en"
base_url = "https://coingecko.com"
page = requests.get(url)
soup = BeautifulSoup(page.content,"html.parser")
names = [div.a.span.text for div in soup.find_all("div",attrs={"class":"coin-content center"})]
Link = [base_url+div.a["href"] for div in soup.find_all("div",attrs={"class":"coin-content center"})]
for link in Link:
inner_page = requests.get(link)
inner_soup = BeautifulSoup(inner_page.content,"html.parser")
indent = inner_soup.find("div",attrs={"class":"py-2"})
content = indent.div.next_siblings
Allcontent = [sibling for sibling in content if sibling.string is not None]
print(Allcontent)
I have successfully enter to innerpage and grabbed all coins' information from the first page listed coin. But there is next page as 1,2,3,4,5,6,7,8,9 etc. How can I go to all the next page and do the same as previously?
Further, the output of my code contains a lot of \n and space. How can I fix that.
You need to generate all the pages and requests one by one and parse using bs4
from bs4 import BeautifulSoup
import requests
req = requests.get('https://www.coingecko.com/en')
soup = BeautifulSoup(req.content, 'html.parser')
last_page = soup.select('ul.pagination li:nth-of-type(8) > a:nth-of-type(1)')[0]['href']
lp = last_page.split('=')[-1]
count = 0
for i in range(int(lp)):
count+=1
url = 'https://www.coingecko.com/en?page='+str(count)
print(url)
requests.get(url)#requests each page one by one till last page
##parse your fileds here using bs4
The way you have written your script has got a messy look. Try with .select() to make it concise and less prone to break. Although I could not find the further usage of names in your script, I kept it as it is. Here is how you can get all the available links traversing multiple pages.
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests
url = "https://coingecko.com/en"
while True:
page = requests.get(url)
soup = BeautifulSoup(page.text,"lxml")
names = [item.text for item in soup.select("span.d-lg-block")]
for link in [urljoin(url,item["href"]) for item in soup.select(".coin-content a")]:
inner_page = requests.get(link)
inner_soup = BeautifulSoup(inner_page.text,"lxml")
desc = [item.get_text(strip=True) for item in inner_soup.select(".py-2 p") if item.text]
print(desc)
try:
url = urljoin(url,soup.select_one(".pagination a[rel='next']")['href'])
except TypeError:break
Btw, whitespaces have also been taken care of by using .get_text(strip=True)
I'm trying to get the href link for a movie (ex: search Iron Man on IMDB) but I can't seem to get it. I keep getting "None" when I run the code but if I remove .get('href'), the code will return the entire line of html (including the link I want). I appreciate any help with this. Thanks!
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin # For joining next page url with base url
search_terms = input("What movie do you want to know about?\n> ").split()
url = "http://www.imdb.com/find?ref_=nv_sr_fn&q=" + '+'.join(search_terms) + '&s=all'
def scrape_find_next_page(url):
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
next_page = soup.find('td', 'result_text').get('href')
return next_page
next_page_url = scrape_find_next_page(url)
You are trying to get the href from td, which the attribute does not exist. You need to get the a tag that contains the href attribute
next_page = soup.find('td', 'result_text').find('a').get('href')