Downloading video from URL using Python results in a file with 0 bytes

I'm currently trying to do some web scraping from this website: https://likee.video/hashtag/CuteHeadChallenge?lang=en
I managed to extract the video link using Selenium, BeautifulSoup, and json. Here is the code:
# click the element
element.click()
time.sleep(3)
page = wd.page_source
souptest = BeautifulSoup(page, 'html.parser')
# code to get vid from clicked element here
data = json.loads(souptest.find('script', {"id": 'videoObject', "type": 'application/ld+json'}).text, strict=False)
folderpath = '/content/drive/MyDrive/LikeeData/' + data["author"]["name"]
video = data["contentUrl"]
The problem is that when I try to download the URL, the downloaded file has a size of 0 bytes. Here is what I tried:
urllib.request.urlretrieve(video, folderpath + '/vid.mp4')
req_file = requests.get(video)
with open(folderpath + '/vid.mp4', "wb") as file:
    file.write(req_file.content)
Then I noticed that the GET request returned response 204, so I tried some solutions from another thread:
r = requests.get(data["contentUrl"], stream=True)
while r.status_code == 204:
    time.sleep(1)
    print('still 204')
    r = requests.get(data["contentUrl"])
but it did not work and always returned 204. When I open the link in my browser it returns response 200, and when I point the same code at the thumbnail URL it works; it fails only when I try to download the video URL. Here is one such video URL: https://video.like.video/asia_live/2s2/2Dz9d6_4.mp4?crc=1506960817&type=5
Please help me find out what is wrong here. Thank you for your assistance.

You can try this example of how to download all the videos on the page (without Selenium):
import re
import json
import requests
url = "https://likee.video/hashtag/CuteHeadChallenge?lang=en"
api_url = "https://api.like-video.com/likee-activity-flow-micro/videoApi/getEventVideo"
payload = {
    "country": "US",
    "page": 1,
    "pageSize": 28,
    "topicId": "",
}
html_doc = requests.get(url).text
data = re.search(r"window\.data = ({.*});", html_doc)[1]
data = json.loads(data)
payload["topicId"] = data["topicId"]
data = requests.post(api_url, json=payload).json()
# uncomment this to print all data:
# print(json.dumps(data, indent=4))
# print/save each videoUrl:
for i, video in enumerate(data["data"]["videoList"], 1):
    print("Downloading {} as {}.mp4".format(video["videoUrl"], i))
    # download video
    with open("{}.mp4".format(i), "wb") as f_out:
        f_out.write(requests.get(video["videoUrl"]).content)
Prints:
Downloading https://video.like.video/asia_live/2s2/2Dz9d6_4.mp4?crc=1506960817&type=5 as 1.mp4
Downloading https://video.like.video/asia_live/2s1/2gaWZB_4.mp4?crc=1571964795&type=5 as 2.mp4
Downloading https://video.like.video/asia_live/2s1/2RLMdC_4.mp4?crc=779823808&type=5 as 3.mp4
...
and saves the videos.
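If you still want to download a single contentUrl the way the question does, the 204 may simply mean the CDN wants browser-like headers; that is an assumption on my part, not something confirmed for this server. A minimal sketch that sends a User-Agent and Referer and streams the file to disk in chunks (the example URL is the one from the question):
import requests

video_url = "https://video.like.video/asia_live/2s2/2Dz9d6_4.mp4?crc=1506960817&type=5"
# assumed headers: a browser-like User-Agent and the site as Referer
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Referer": "https://likee.video/",
}
with requests.get(video_url, headers=headers, stream=True) as r:
    r.raise_for_status()  # fail loudly instead of silently writing an empty file
    with open("vid.mp4", "wb") as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)  # stream in 8 KB chunks so large videos don't sit in memory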

Related

Problem when downloading an image from a website using Beautiful Soup

I am writing this code to get information about the top movies and also download the image belonging to each movie. Some images download with a size of 0 bytes (even though Windows shows a "size on disk"), yet when I click the image's link it opens fine in the browser, so there is no problem with the link itself.
For example, this is one of the image links:
https://static.stacker.com/s3fs-public/styles/slide_desktop/s3/00000116_4_0.png
import requests
from bs4 import BeautifulSoup
import pandas as pd

URL = "https://stacker.com/stories/1587/100-best-movies-all-time"
count = 0
local_description = ""
movie_data = []
data = requests.get(URL).text
soap = BeautifulSoup(data, "html.parser")
titles = soap.find_all(name="h2", class_="ct-slideshow__slide__text-container__caption")[1:]
description = soap.find_all(name="div", class_="ct-slideshow__slide__text-container__description")[1:]
images = soap.find_all(name="img", typeof="foaf:Image")[6:106]
for num in range(100):
    movie_name = titles[num].getText().replace("\n", "")
    local_des = description[num].find_all(name="p")[1:]
    for s in local_des:
        local_description = s.getText().replace(" ", "")
    local_data = {"title": movie_name, "description": local_description}
    movie_data.append(local_data)
    movie_image_link = images[num].get("src")
    response = requests.get(movie_image_link)
    with open(f"images/{movie_name}.png", 'wb') as f:
        f.write(response.content)
    count += 1
    print(count)
data_collected = pd.DataFrame(movie_data)
data_collected.to_csv("Data/100_movie.csv", index=False)
I found my problem: some movie names contained ":" and, as you know, you can't use ":" in file names. I fixed the code with .replace() (note the result has to be assigned back, since strings are immutable):
movie_name = movie_name.replace(":", "")
Once you get a response, check that it succeeded and that the body isn't empty before writing to disk; you might need to retry, or the link may be bad.
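A minimal sketch of that check, wrapping the download from the question (the retry count and helper name are my own choices; movie_name and movie_image_link come from the question's loop):
import time
import requests

def download_image(url, path, retries=3):
    # retry a few times; only write when we got a 200 with a non-empty body
    for _ in range(retries):
        response = requests.get(url)
        if response.status_code == 200 and len(response.content) > 0:
            with open(path, "wb") as f:
                f.write(response.content)
            return True
        time.sleep(1)  # short pause before retrying
    return False

safe_name = movie_name.replace(":", "")  # ":" is not allowed in Windows file names
if not download_image(movie_image_link, f"images/{safe_name}.png"):
    print(f"could not download {movie_image_link}")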

Web scraping PDFs in Python across multiple links

I am trying to scrape this website. To do so, I wrote the following code, which works nicely:
from bs4 import BeautifulSoup
import pandas as pd
import requests

payload = 'from=&till=&objid=cbspeeches&page=&paging_length=10&sort_list=date_desc&theme=cbspeeches&ml=false&mlurl=&emptylisttext='
url = 'https://www.bis.org/doclist/cbspeeches.htm'
headers = {
    "content-type": "application/x-www-form-urlencoded",
    "X-Requested-With": "XMLHttpRequest"
}
req = requests.post(url, headers=headers, data=payload)
soup = BeautifulSoup(req.content, "lxml")
data = []
for card in soup.select('.documentList tbody tr'):
    r = BeautifulSoup(requests.get(f"https://www.bis.org{card.a.get('href')}").content)
    data.append({
        'date': card.select_one('.item_date').get_text(strip=True),
        'title': card.select_one('.title a').get_text(strip=True),
        'author': card.select_one('.authorlnk.dashed').get_text(strip=True),
        'url': f"https://www.bis.org{card.a.get('href')}",
        'text': r.select_one('#cmsContent').get_text('\n\n', strip=True)
    })
pd.DataFrame(data)
However, if you open, for example, the first link on the page, there is a PDF in it. Whenever a link contains a PDF, I would like to add the content of that PDF to my dataframe.
To do so, I looked around and tried the following on just the first PDF of the first link:
import io
from PyPDF2 import PdfFileReader

def info(pdf_path):
    response = requests.get(pdf_path)
    with io.BytesIO(response.content) as f:
        pdf = PdfFileReader(f)
        information = pdf.getDocumentInfo()
        number_of_pages = pdf.getNumPages()
    txt = f"""
    Information about {pdf_path}:
    Author: {information.author}
    Creator: {information.creator}
    Producer: {information.producer}
    Subject: {information.subject}
    Title: {information.title}
    Number of pages: {number_of_pages}
    """
    print(txt)
    return information

info('https://www.bis.org/review/r220708e.pdf')
However, it just gets the metadata (which I already have from the previous code); it is missing the text. Ideally, I would like it to be part of the same code as above. I got stuck here.
Can anyone help me with this?
Thanks!
You need to return it, e.g. as a tuple:
return txt, information
If you want the text inside the pdf:
text = ""
for page in pdf.pages:
    text += page.extract_text() + "\n"
I'll leave you the pleasure of adapting this to your requests-based, sync scraping fashion (really not hard):
from httpx import AsyncClient  # assumed import: an AsyncClient matching these arguments comes from httpx
from PyPDF2 import PdfReader
...
async def get_full_content(url):
    async with AsyncClient(headers=headers, timeout=60.0, follow_redirects=True) as client:
        if url[-3:] == 'pdf':
            r = await client.get(url)
            with open(f'{url.split("/")[-1]}', 'wb') as f:
                f.write(r.content)
            reader = PdfReader(f'{url.split("/")[-1]}')
            pdf_text = ''
            number_of_pages = len(reader.pages)
            for x in range(number_of_pages):
                page = reader.pages[x]
                text = page.extract_text()
                pdf_text = pdf_text + text
And then you do something with the pdf_text extracted from .pdf (saving it into a db, reading it with pandas, nlp-ing it with Transformers/torch, etc).
Edit: one more thing: do a pip install -U pypdf2 as the package was recently updated (a few hours ago), just to make sure you're up to date.
Edit 2: A copy/pastable example, for a single .pdf file:
from PyPDF2 import PdfReader
import requests

url = 'https://www.bis.org/review/r220708e.pdf'
r = requests.get(url)
with open(f'{url.split("/")[-1]}', 'wb') as f:
    f.write(r.content)
reader = PdfReader(f'{url.split("/")[-1]}')
pdf_text = ''
number_of_pages = len(reader.pages)
for x in range(number_of_pages):
    page = reader.pages[x]
    text = page.extract_text()
    pdf_text = pdf_text + text
print(pdf_text)
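Since the question asked for the PDF text to live in the same dataframe loop, here is one way to wire it in; the helper, the in-memory BytesIO, and the .htm-to-.pdf URL rewrite are my assumptions, not something the answers above confirm:
import io
import requests
from PyPDF2 import PdfReader

def get_pdf_text(pdf_url):
    # download the PDF into memory and return the concatenated page text
    response = requests.get(pdf_url)
    with io.BytesIO(response.content) as f:
        reader = PdfReader(f)
        return "\n".join(page.extract_text() or "" for page in reader.pages)

# inside the question's loop, after data.append({...}):
# the .htm -> .pdf rewrite assumes the BIS speech pages and their PDFs share a
# name, as with https://www.bis.org/review/r220708e.htm / r220708e.pdf
pdf_link = f"https://www.bis.org{card.a.get('href')}".replace(".htm", ".pdf")
data[-1]["pdf_text"] = get_pdf_text(pdf_link)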

Scraping with BeautifulSoup only gets me 33 responses from an infinite-scrolling page. How do I increase the number of responses?

The website link:
https://collegedunia.com/management/human-resources-management-colleges
The code:
import requests
from bs4 import BeautifulSoup

r = requests.get("https://collegedunia.com/management/human-resources-management-colleges")
c = r.content
soup = BeautifulSoup(c, "html.parser")
all = soup.find_all("div", {"class": "jsx-765939686 col-4 mb-4 automate_client_img_snippet"})
l = []
for divParent in all:
    item = divParent.find("div", {"class": "jsx-765939686 listing-block text-uppercase bg-white position-relative"})
    d = {}
    d["Name"] = item.find("div", {"class": "jsx-765939686 top-block position-relative overflow-hidden"}).find("div", {"class": "jsx-765939686 clg-name-address"}).find("h3").text
    d["Rating"] = item.find("div", {"class": "jsx-765939686 bottom-block w-100 position-relative"}).find("ul").find_all("li")[-1].find("a").find("span").text
    d["Location"] = item.find("div", {"class": "jsx-765939686 clg-head d-flex"}).find("span").find("span", {"class": "mr-1"}).text
    l.append(d)
import pandas
df = pandas.DataFrame(l)
df.to_excel("Output.xlsx")
The page keeps adding colleges as you scroll down. I don't know if I can get all the data, but is there a way to at least increase the number of responses I get? There are a total of 2506 entries, as can be seen on the website.
Looking at the network requests, we can see the data is fetched from an AJAX endpoint using base64-encoded params. You can follow the code below to fetch the data and parse it into your desired format.
Code:
import json
import pandas
import requests
import base64

collegedata = []
count = 0
while True:
    datadict = {"url": "management/human-resources-management-colleges",
                "stream": "13", "sub_stream_id": "607", "page": count}
    data = base64.urlsafe_b64encode(json.dumps(datadict).encode()).decode()
    params = {
        "data": data
    }
    response = requests.get('https://collegedunia.com/web-api/listing', params=params).json()
    if response["hasNext"]:
        for i in response["colleges"]:
            d = {}
            d["Name"] = i["college_name"]
            d["Rating"] = i["rating"]
            d["Location"] = i["college_city"] + ", " + i["state"]
            collegedata.append(d)
            print(d)
    else:
        break
    count += 1
df = pandas.DataFrame(collegedata)
df.to_excel("Output.xlsx", index=False)
Let me know if you have any questions :)
When you analyse the website via the Network tab in Chrome, you can see that it makes XHR calls in the background.
The endpoint to which it sends an HTTP GET request is as follows:
https://collegedunia.com/web-api/listing?data=eyJ1cmwiOiJtYW5hZ2VtZW50L2h1bWFuLXJlc291cmNlcy1tYW5hZ2VtZW50LWNvbGxlZ2VzIiwic3RyZWFtIjoiMTMiLCJzdWJfc3RyZWFtX2lkIjoiNjA3IiwicGFnZSI6M30=
When you send a GET via the requests module, you get a JSON response back.
import requests
url = "https://collegedunia.com/web-api/listing?data=eyJ1cmwiOiJtYW5hZ2VtZW50L2h1bWFuLXJlc291cmNlcy1tYW5hZ2VtZW50LWNvbGxlZ2VzIiwic3RyZWFtIjoiMTMiLCJzdWJfc3RyZWFtX2lkIjoiNjA3IiwicGFnZSI6M30="
res = requests.get(url)
print(res.json())
But you need all the data, not only page 1. The data sent in the request is base64-encoded, i.e. if you decode the data parameter of the GET request, you see the following:
{"url":"management/human-resources-management-colleges","stream":"13","sub_stream_id":"607","page":3}
Now change the page number (and sub_stream_id, stream, etc. if needed) accordingly to get the complete data from the website.
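A small sketch of that round trip, so you can build the data parameter for any page (the field values come from the decoded request above):
import base64
import json

# decode the data parameter from the URL above
encoded = "eyJ1cmwiOiJtYW5hZ2VtZW50L2h1bWFuLXJlc291cmNlcy1tYW5hZ2VtZW50LWNvbGxlZ2VzIiwic3RyZWFtIjoiMTMiLCJzdWJfc3RyZWFtX2lkIjoiNjA3IiwicGFnZSI6M30="
print(json.loads(base64.urlsafe_b64decode(encoded)))
# -> {'url': 'management/human-resources-management-colleges', 'stream': '13', 'sub_stream_id': '607', 'page': 3}

# encode the same structure back for an arbitrary page number
def listing_params(page):
    payload = {"url": "management/human-resources-management-colleges",
               "stream": "13", "sub_stream_id": "607", "page": page}
    return {"data": base64.urlsafe_b64encode(json.dumps(payload).encode()).decode()}

# e.g. requests.get('https://collegedunia.com/web-api/listing', params=listing_params(4))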

How to make this program use Instagram pic URLs and download? [duplicate]

This question already has answers here: Download large file in python with requests (8 answers). Closed 2 years ago.
The goal is for the program to take a user-given Instagram URL and allow downloading and saving a picture.
I've got the main part in place, but I can't understand how to go further and use the filtered, correct URL to download and save the picture on my computer.
My code so far:
EDIT: I added a download line, but I can't seem to get the right file type? I mean, it saves as whatever I want it to, but I can't open it:
import requests
import re
import shutil

def get_response(url):
    r = requests.get(url)
    while r.status_code != 200:
        r.raw.decode_content = True
        r = requests.get(url, stream=True)
    return r.text

def prepare_urls(matches):
    return list({match.replace("\\u0026", "&") for match in matches})

url = input('Enter Instagram URL: ')
response = get_response(url)
vid_matches = re.findall('"video_url":"([^"]+)"', response)
pic_matches = re.findall('"display_url":"([^"]+)"', response)
vid_urls = prepare_urls(vid_matches)
pic_urls = prepare_urls(pic_matches)
if vid_urls:
    print('Detected Videos:\n{0}'.format('\n'.join(vid_urls)))
    print("Can't download video, the provided URL must be of a picture.")
if pic_urls:
    print('Detected Pictures:\n{0}'.format('\n'.join(pic_urls)))
    from urllib.request import urlretrieve
    dst = 'Instagram picture.jpg'
    urlretrieve(url, dst)
    # EDIT ^
if not (vid_urls or pic_urls):
    print('Could not recognize the media in the provided URL.')
I think this might help...
import requests
from bs4 import BeautifulSoup as bs
import json
import os.path

insta_url = 'https://www.instagram.com'
inta_username = input('enter username of instagram : ')
response = requests.get(f"{insta_url}/{inta_username}/")
if response.ok:
    html = response.text
    bs_html = bs(html, features="lxml")
    bs_html = bs_html.text
    index = bs_html.find('profile_pic_url_hd') + 21
    remaining_text = bs_html[index:]
    remaining_text_index = remaining_text.find('requested_by_viewer') - 3
    string_url = remaining_text[:remaining_text_index].replace("\\u0026", "&")
    print(string_url, "\ndownloading...")
    filename = 'pic_ins.jpg'
    file_exists = os.path.isfile(filename)
    if not file_exists:  # skip the download if the file is already there
        with open(filename, 'wb+') as handle:
            response = requests.get(string_url, stream=True)
            if not response.ok:
                print(response)
            for block in response.iter_content(1024):
                if not block:
                    break
                handle.write(block)
print("completed")
You can change the name of the downloaded image by changing the filename variable.
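One more note on the question's EDIT: urlretrieve is being given url (the Instagram page the user typed in), not one of the extracted picture URLs, which is likely why the saved file can't be opened as an image. A minimal sketch of the fix, reusing the question's pic_urls list:
import requests

if pic_urls:
    pic_url = pic_urls[0]  # first picture URL extracted by the question's regex
    r = requests.get(pic_url, stream=True)
    r.raise_for_status()  # fail loudly on a bad response instead of saving junk
    with open('Instagram picture.jpg', 'wb') as f:
        for chunk in r.iter_content(8192):
            f.write(chunk)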

Problems getting links from a YouTube channel with BeautifulSoup

I am trying to scrape a YouTube channel and return the links for all of this channel's videos. However, when I try to print out these links, I only get a few links that have nothing to do with the videos. I suspect the videos may be loaded by JavaScript; is there even a way to do this with BeautifulSoup, or will I have to use Selenium? Can somebody please help me and do some testing? Here is my code so far:
import requests
from bs4 import BeautifulSoup

print('scanning page...')
youtuber = 'memeulous'
result = requests.get('https://www.youtube.com/c/' + youtuber + '/videos')
status = result.status_code
src = result.content
soup = BeautifulSoup(src, 'lxml')
links = soup.find_all('a')
if status == 200:
    print('valid URL, grabbing uploads...')
else:
    print('invalid URL, status code: ' + str(status))
    quit()
print(links)
and here is my output:
scanning page...
valid URL, grabbing uploads...
[About, Press, Copyright, Contact us, Creators, Advertise, Developers, Terms, Privacy, Policy and Safety, How YouTube works, Test new features]
[Finished in 4.0s]
as you can see, no video links.
One way of doing this would be with the following code:
import requests

api_key = "PASTE_YOUR_API_KEY_HERE!"
yt_user = "memeulous"
api_url = f"https://www.googleapis.com/youtube/v3/channels?part=contentDetails&forUsername={yt_user}&key={api_key}"
response = requests.get(api_url).json()
playlist_id = response["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]
channel_url = f"https://www.googleapis.com/youtube/v3/playlistItems?" \
              f"part=snippet%2CcontentDetails&maxResults=50&playlistId={playlist_id}&key={api_key}"

def get_video_ids(vid_data: dict) -> list:
    return [_id["contentDetails"]["videoId"] for _id in vid_data["items"]]

def build_links(vid_ids: list) -> list:
    return [f"https://www.youtube.com/watch?v={_id}" for _id in vid_ids]

def get_all_links() -> list:
    all_links = []
    url = channel_url
    while True:
        res = requests.get(url).json()
        all_links.extend(build_links(get_video_ids(res)))
        try:
            paging_token = res["nextPageToken"]
            url = f"{channel_url}&pageToken={paging_token}"
        except KeyError:
            break
    return all_links

print(get_all_links())
This gets you all the video links (469) for the memeulous user.
['https://www.youtube.com/watch?v=4L8_isnyGfg', 'https://www.youtube.com/watch?v=ogpaiD2e-ss', 'https://www.youtube.com/watch?v=oH-nJe9XMN0', 'https://www.youtube.com/watch?v=kUcbKl4qe5g', ...
You can get the total video count from the pageInfo object included in each playlistItems response (videos_data below stands for one such parsed JSON response, e.g. res inside get_all_links):
print(f"Total videos: {videos_data['pageInfo']['totalResults']}")
I hope this helps and gets you started. All you need to do is get an API key for the YouTube Data API.
