How to get resolutions with file extension in pytube - Python

from pytube import YouTube
link = "https://www.youtube.com/watch?v=LXb3EKWsInQ"
yt = YouTube(link)
resolutions = []
video_resolutions = []
video_res = []
for stream in yt.streams.filter(subtype="mp4").order_by("resolution"):
    video_resolutions.append(stream.resolution)
print(video_resolutions)
Result:
['144p', '144p', '144p', '240p', '240p', '240p', '360p', '360p', '360p', '360p', '480p', '480p', '480p', '720p', '720p', '720p', '720p', '1080p', '1080p', '1080p', '1440p', '1440p', '2160p', '2160p']
I am trying to build a YouTube video downloader, but I am stuck at getting the resolutions because the streams list contains duplicates.
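One way to deal with the duplicates is to keep each (resolution, file extension) pair only once while preserving the sorted order. A minimal sketch building on the code above (stream.subtype is the stream's file extension, e.g. "mp4"):

from pytube import YouTube

link = "https://www.youtube.com/watch?v=LXb3EKWsInQ"
yt = YouTube(link)

# Keep each (resolution, extension) pair only once, preserving the sorted order.
unique_resolutions = []
for stream in yt.streams.filter(subtype="mp4").order_by("resolution"):
    item = (stream.resolution, stream.subtype)
    if item not in unique_resolutions:
        unique_resolutions.append(item)

print(unique_resolutions)
# e.g. [('144p', 'mp4'), ('240p', 'mp4'), ..., ('2160p', 'mp4')]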

Related

How to crawl images inside links of a page

I need the crawler to go to the links inside a website and scan images there. I've managed to get this far but I'm confused.
I'm trying to do something like this but I'm sure there's gonna be an easier way.
from bs4 import BeautifulSoup
import requests as rq
import os
import sys
from urllib.parse import urlparse

page_url = sys.argv[1]
depth = int(sys.argv[2])
crawl = str(page_url)

r2 = rq.get('https://www.' + crawl + '/')
soup2 = BeautifulSoup(r2.text, "html.parser")

links = []
images = []
link_urls = soup2.select('a')

def url_validator(link):
    try:
        result = urlparse(link)
        return all([result.scheme, result.netloc])
    except:
        return False

def crawl_images(link):
    requested_link = rq.get(link)
    images = BeautifulSoup(requested_link.text, "html.parser")
    image = images.select('img')
    for img in image:
        print(img['src'])
        return img['src']

for link_url in link_urls[:depth]:
    links.append(link_url['href'])

for link in links:
    # print(link)
    if url_validator(link):
        crawl_images(link)
I run python3 new_crawler.py imdb.com 3, which should print the sources of the images crawled from the first 3 links inside imdb.com, but it's not printing anything.
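A likely reason nothing is printed is that most href values on a page are relative paths, which fail url_validator. A minimal sketch (assuming the same variables as above) that normalizes them with urllib.parse.urljoin before validating:

from urllib.parse import urljoin

base = 'https://www.' + crawl + '/'
links = []
for link_url in link_urls[:depth]:
    href = link_url.get('href')          # .get() avoids a KeyError on <a> tags without href
    if href:
        # urljoin turns relative paths like "/some/page/" into absolute URLs
        links.append(urljoin(base, href))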
You want to crawl through the images, correct? Try this:
from bs4 import BeautifulSoup
import requests as rq
URL = ""
source = rq.get(URL)
soup = BeautifulSoup(source.text, "html.parser")
image_links = soup.find_all("img")
for img in image_links:
    print(img['src'])
Add the URL of the website you are trying to scrape to the URL constant. The page's img tags should all be saved in the image_links variable.
This is what I ended up with. It's not working the way it's supposed to, but the time for the task is up and I decided to share it anyway.
from bs4 import BeautifulSoup
import requests as rq
import sys
from urllib.parse import urlparse
import json

page_url = sys.argv[1]
depth = int(sys.argv[2])
crawl = str(page_url)

r2 = rq.get('https://www.' + crawl + '/')
soup2 = BeautifulSoup(r2.text, "html.parser")
link_urls = soup2.select('a')

links = []
images_sources = []

def url_validator(link):
    try:
        result = urlparse(link)
        return all([result.scheme, result.netloc])
    except:
        return False

def crawl_images(link):
    requested_link = rq.get(link)
    images = BeautifulSoup(requested_link.text, "html.parser")
    image = images.select('img')
    for img in image:
        images_sources.append(img['src'])
        results = {
            "imageUrl": img['src'],
            "sourceUrl": link,
            "depth": depth
        }
        json_object = json.dumps(results)
        with open("results.json", "w") as f:
            f.write(json_object)
    return results

for link_url in link_urls[:depth]:
    links.append(link_url['href'])

for link in links:
    if url_validator(link):
        crawl_images(link)
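One reason only a single entry ends up in results.json is that the file is reopened in write mode for every image, so each write overwrites the previous one. A minimal sketch (assuming the same variables as above) that collects everything first and writes the file once at the end:

# Collect one dict per image, then dump the whole list to disk once after crawling.
all_results = []

def crawl_images(link):
    requested_link = rq.get(link)
    soup = BeautifulSoup(requested_link.text, "html.parser")
    for img in soup.select('img'):
        src = img.get('src')                 # .get() avoids a KeyError on <img> tags without src
        if src:
            all_results.append({"imageUrl": src, "sourceUrl": link, "depth": depth})

for link in links:
    if url_validator(link):
        crawl_images(link)

with open("results.json", "w") as f:
    json.dump(all_results, f, indent=2)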

Low-quality images from a Google search result

This Python code downloads an image from a Google search result. It works fine, but all of the images are low quality.
How can I download them in higher quality?
Also, sorry for my bad variable names.
import requests
import bs4

ss = "hello world"
sss = requests.utils.quote(ss)   # URL-encode the search phrase
print(sss)
jomle = str(sss)
utl = r"https://www.google.com/search?q=" + jomle
req = requests.get(utl)
print("koobs")
soup = bs4.BeautifulSoup(req.text, "html.parser")
els = soup.find_all("img")
somethin = els[2]["src"]         # src of the third <img> tag on the results page
print(somethin)

Downloading video from URL using Python results in a file with 0 bytes

I'm currently trying to do some web scraping from this website: https://likee.video/hashtag/CuteHeadChallenge?lang=en
I managed to extract the video link by using Selenium, BeautifulSoup and json; here is the code:
#click the element
element.click()
time.sleep(3)
page = wd.page_source
souptest = BeautifulSoup(page,'html.parser')
# code to get vid from clicked element here
data = json.loads(souptest.find('script', {"id" : 'videoObject', "type" : 'application/ld+json'}).text , strict=False)
folderpath = '/content/drive/MyDrive/LikeeData/' + data["author"]["name"]
video = data["contentUrl"]
The problem is that when I try to download the URL, the downloaded file has a size of 0 bytes. Here is what I tried:
urllib.request.urlretrieve(video, folderpath + '/vid.mp4')
req_file = requests.get(video)
with open(folderpath + '/vid.mp4', "wb") as file:
    file.write(req_file.content)
Then I noticed that the GET request returned response 204, so I tried some solutions from another thread:
r = requests.get(data["contentUrl"], stream=True)
while r.status_code == 204:
    time.sleep(1)
    print('still 204')
    r = requests.get(data["contentUrl"])
but it did not work and always returned 204. When I open the link in my browser it returns response 200, and when I change the URL to the thumbnail it works; it fails only when I try to download the video URL. Here is one such video URL: https://video.like.video/asia_live/2s2/2Dz9d6_4.mp4?crc=1506960817&type=5
Please help me find out what is wrong here. Thank you for your assistance.
You can try this example of how to download all the videos on the page (without Selenium):
import re
import json
import requests

url = "https://likee.video/hashtag/CuteHeadChallenge?lang=en"
api_url = "https://api.like-video.com/likee-activity-flow-micro/videoApi/getEventVideo"

payload = {
    "country": "US",
    "page": 1,
    "pageSize": 28,
    "topicId": "",
}

html_doc = requests.get(url).text
data = re.search(r"window\.data = ({.*});", html_doc)[1]
data = json.loads(data)

payload["topicId"] = data["topicId"]
data = requests.post(api_url, json=payload).json()

# uncomment this to print all data:
# print(json.dumps(data, indent=4))

# print/save each videoUrl:
for i, video in enumerate(data["data"]["videoList"], 1):
    print("Downloading {} as {}.mp4".format(video["videoUrl"], i))

    # download video
    with open("{}.mp4".format(i), "wb") as f_out:
        f_out.write(requests.get(video["videoUrl"]).content)
Prints:
Downloading https://video.like.video/asia_live/2s2/2Dz9d6_4.mp4?crc=1506960817&type=5 as 1.mp4
Downloading https://video.like.video/asia_live/2s1/2gaWZB_4.mp4?crc=1571964795&type=5 as 2.mp4
Downloading https://video.like.video/asia_live/2s1/2RLMdC_4.mp4?crc=779823808&type=5 as 3.mp4
...
and saves the videos.
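For larger videos it may be safer to stream the download to disk in chunks rather than hold the entire response in memory. A small, optional variation on the download step above (same videoUrl field assumed):

import requests

def download_video(url, path):
    # Stream the response body in chunks so large files never sit fully in memory.
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(path, "wb") as f_out:
            for chunk in r.iter_content(chunk_size=1 << 16):
                f_out.write(chunk)

# e.g. download_video(video["videoUrl"], "{}.mp4".format(i))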

RegistryError: Can't find 'spacy.Tokenizer.v1' in registry spacy -> tokenizers. Available names: none

I am trying to:
Tokenize a CV from a PDF
This is what I have done:
import spacy
import PyPDF2

pages = []
nlp = spacy.load("fr_core_news_sm")
test = "C:\\Users\\admin\\OneDrive\\Bureau\\Stage\\file.pdf"

def return_token(p):
    read_pdf = PyPDF2.PdfFileReader(test)
    number_of_pages = read_pdf.getNumPages()
    for page_number in range(number_of_pages):
        page = read_pdf.getPage(page_number)
        p = page.extractText().replace('\n', '')
        pages.append(page)
    doc = nlp(p)
    return [X.text for X in doc]

return_token(p)
Error:
RegistryError: Can't find 'spacy.Tokenizer.v1' in registry spacy -> tokenizers. Available names: none
I ran into this error as well, and it was resolved by restarting the kernel.

raise RuntimeError('FPDF error: '+msg) RuntimeError: FPDF error: Unsupported image type: chapter_1_romance_dawn

Okay, so I am working on a manga (Japanese comics) downloader. Manga are available to read online, but if you want to download them you have to save the image files one by one by right-clicking, and so on.
So, I was working on an alternative manga downloader that will download all the chapters of the manga you specify and then convert them to a PDF.
I have completed the code for downloading the images and it works quite well, but the problem is in the PDF-conversion part.
Here's my code:
import requests
import urllib
import glob
from bs4 import BeautifulSoup
import os
from fpdf import FPDF

def download_image(url, path):
    r = requests.get(url, stream=True)
    if r.status_code == 200:
        with open(path, 'wb') as f:
            for chunk in r:
                f.write(chunk)

start_chapter = int(input("Enter Starting Chapter: "))
end_chapter = int(input("Enter Ending Chapter: "))
chapters = range(start_chapter, end_chapter + 1)

chapter_list = []
for chapter in chapters:
    chapter_list.append("https://manganelo.com/chapter/read_one_piece_manga_online_free4/chapter_" + str(chapter))

for URL in chapter_list:
    r = requests.get(URL)
    soup = BeautifulSoup(r.text, 'html.parser')
    images = soup.findAll('img')
    for i in images:
        url = i.attrs["src"]
        os.makedirs(url.split('/')[-2], exist_ok=True)
        download_image(url, os.path.join(url.split('/')[-2], url.split('/')[-1]))

pdf = FPDF()
imageList = glob.glob("*")
for image in imageList:
    pdf.add_page()
    pdf.image(image, 10, 10, 200, 300)
pdf.output("One Piece Chapter", "F")
So, any suggestions on how I can fix this error?
raise RuntimeError('FPDF error: '+msg) RuntimeError: FPDF error: Unsupported image type: chapter_1_romance_dawn
First of all, this is a very nice idea.
The error occurs because the image list path is wrong.
You are storing the JPGs in the chapter-name folders.
All you have to do is give the correct path to FPDF.
I created a set to avoid duplicates.
Then I removed the "images" and "icons" folders -> maybe you will use them?
cchapter = set()

for URL in chapter_list:
    r = requests.get(URL)
    soup = BeautifulSoup(r.text, 'html.parser')
    images = soup.findAll('img')
    for i in images:
        url = i.attrs["src"]
        cchapter.add(url.split('/')[-2])
        os.makedirs(url.split('/')[-2], exist_ok=True)
        download_image(url, os.path.join(url.split('/')[-2], url.split('/')[-1]))

cchapter.remove('images')
cchapter.remove('icons')
chapterlist = list(cchapter)
print(chapterlist[0])

def sortKeyFunc(s):
    return int(os.path.basename(s)[:-4])

for chap in chapterlist:
    pdf = FPDF()
    imageList = glob.glob(chap + "/*.jpg")
    imageList.sort(key=sortKeyFunc)
    for image in imageList:
        pdf.add_page()
        pdf.image(image, 10, 10, 200, 300)
    pdf.output(chap + ".pdf", "F")
Finally, I added a loop to create a PDF for each folder, naming each PDF after its chapter name.
You were also missing the extension (".pdf") in your output.
This will work. :)
EDIT:
glob.glob will not return the file list in the correct order.
Reference: here
It is probably not sorted at all and uses the order at which entries
appear in the filesystem, i.e. the one you get when using ls -U. (At
least on my machine this produces the same order as listing glob
matches).
Therefore you can use the filename (in our case a number) as the sort key.
def sortKeyFunc(s):
    return int(os.path.basename(s)[:-4])
Then add imageList.sort(key=sortKeyFunc) in the loop.
NOTE: Code is updated.
