BeautifulSoup: MissingSchema invalid URL error - Python

I am trying to download images from a web page using BeautifulSoup. I am getting the following error
MissingSchema: Invalid URL
import requests
from bs4 import BeautifulSoup
import os
from os.path import basename
from urllib.request import urlopen

url = "https://xxxxxx"
#r = requests.get(url)
request_page = urlopen(url)
page_html = request_page.read()
request_page.close()
soup = BeautifulSoup(page_html, 'html.parser')
#print(soup.title.text)
images = soup.find_all('img')
for image in images:
    name = image['alt']
    link = image['src']
    with open(name.replace(' ', '-').replace('/', '') + 'jpg', 'wb') as f:
        im = requests.get(link)
        f.write(im.content)
print(images)
I am unsure why. I know I can read the images fine, because the print worked fine until I added the following code:
with open(name.replace(' ', '-').replace('/', '') + 'jpg', 'wb') as f:
    im = requests.get(link)
    f.write(im.content)
I would be grateful for any help
thanks
EDIT
The url is
url = "https://en.wikipedia.org/wiki/Wikipedia:Picture_of_the_day/September_2018"
I added the print of the link as requested and the output is below:
//upload.wikimedia.org/wikipedia/commons/thumb/0/0e/Portrait_of_Tsaritsa_Natalya_Kirillovna_Naryshkina_-_Google_Cultural_Institute.jpg/300px-Portrait_of_Tsaritsa_Natalya_Kirillovna_Naryshkina_-_Google_Cultural_Institute.jpg
//upload.wikimedia.org/wikipedia/commons/thumb/c/c5/Titian_-_Portrait_of_a_man_with_a_quilted_sleeve.jpg/280px-Titian_-_Portrait_of_a_man_with_a_quilted_sleeve.jpg
//upload.wikimedia.org/wikipedia/commons/thumb/f/f7/Bee_on_Lavender_Blossom_2.jpg/250px-Bee_on_Lavender_Blossom_2.jpg
EDIT
I am just wondering if it is the length of the name in the link? Looking at it, the jpeg seems to be buried in a lot of folders before we get to it.

As I suspected based on the error, the print statement you added shows that the links you are trying to access are not valid URLs.
//upload.wikimedia.org/wikipedia/commons/thumb/0/0e/Portrait_of_Tsaritsa_Natalya_Kirillovna_Naryshkina_-_Google_Cultural_Institute.jpg/300px-Portrait_of_Tsaritsa_Natalya_Kirillovna_Naryshkina_-_Google_Cultural_Institute.jpg needs to start with https:.
To fix this, simply prepend 'https:' to image['src'].
The second issue is that when you write the file, you are saving it as 'Natalya-Naryshkinajpg'. You need '.jpg' as the file extension, for example 'Natalya-Naryshkina.jpg'. I fixed that as well.
Code:
import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/Wikipedia:Picture_of_the_day/September_2019"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
r = requests.get(url, headers=headers)
page_html = r.text
soup = BeautifulSoup(page_html, 'html.parser')
#print(soup.title.text)
images = soup.find_all('img')
for image in images:
    name = image['alt']
    link = 'https:' + image['src']
    #print(link)
    if 'static' not in link:
        try:
            extension = link.split('.')[-1]
            with open(name.replace(' ', '-').replace('/', '') + '.' + extension, 'wb') as f:
                im = requests.get(link, headers=headers)
                f.write(im.content)
            print(name)
        except Exception as e:
            print(e)
print(images)
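A note on the URL fix: if a page ever mixes protocol-relative (//host/...), relative (/path/...) and absolute links, urllib.parse.urljoin is a slightly more general way to build the download URL than prepending 'https:' by hand. A minimal sketch (the example src value is one of the links printed in the question):
from urllib.parse import urljoin

page_url = 'https://en.wikipedia.org/wiki/Wikipedia:Picture_of_the_day/September_2018'
src = '//upload.wikimedia.org/wikipedia/commons/thumb/f/f7/Bee_on_Lavender_Blossom_2.jpg/250px-Bee_on_Lavender_Blossom_2.jpg'

# urljoin resolves protocol-relative, relative and absolute references
# against the page URL, so the scheme is always filled in
link = urljoin(page_url, src)
print(link)  # -> https://upload.wikimedia.org/wikipedia/commons/thumb/f/f7/...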

This should hopefully work:
import re
import requests
from bs4 import BeautifulSoup

site = 'https://books.toscrape.com/'
response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')
img_tags = soup.find_all('img')
urls = [img['src'] for img in img_tags]

for url in urls:
    filename = re.search(r'/([\w_-]+[.](jpg|gif|png))$', url)
    if not filename:
        print("Regex didn't match with the url: {}".format(url))
        continue
    with open(filename.group(1), 'wb') as f:
        if 'http' not in url:
            url = '{}{}'.format(site, url)
        response = requests.get(url)
        f.write(response.content)
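If the regex feels fragile, the standard library can also pull the filename out of the URL. A rough alternative sketch under the same assumptions (relative src values are resolved against site):
import os
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

site = 'https://books.toscrape.com/'
soup = BeautifulSoup(requests.get(site).text, 'html.parser')

for img in soup.find_all('img'):
    full_url = urljoin(site, img['src'])                   # handles relative and absolute src values
    filename = os.path.basename(urlparse(full_url).path)   # e.g. 'cover.jpg'
    if filename:
        with open(filename, 'wb') as f:
            f.write(requests.get(full_url).content)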

Related

Problems with getting data from a page using python, beautiful soup

I am trying to explore web scraping in Python, currently working with Beautiful Soup. I was trying to get the names of the festivals from this site: https://www.skiddle.com/festivals . Everything was going pretty fine, except for one page, this one: https://www.skiddle.com/festivals/front-end-data-test/ . It says 'NoneType' object has no attribute 'find'. Is there any way I can get data from there?
Here is the code
import requests
from bs4 import BeautifulSoup
import lxml
import json

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 OPR/89.0.4447.64"
}

# collect all fests URLs
fests_urls_list = []
#for i in range(0, 120, 24):
for i in range(0, 24, 24):
    url = f"https://www.skiddle.com/festivals/search/?ajaxing=1&sort=0&fest_name=&from_date=15%20Aug%202022&to_date=&maxprice=500&o={i}&bannertitle=August"
    req = requests.get(url=url, headers=headers)
    json_data = json.loads(req.text)
    html_response = json_data["html"]
    with open(f"data/index_{i}.html", "w", encoding="utf-8") as file:
        file.write(html_response)
    with open(f"data/index_{i}.html", "r", encoding="utf-8") as file:
        src = file.read()
    soup = BeautifulSoup(src, "lxml")
    cards = soup.find_all("a", class_="card-details-link")
    for item in cards:
        fest_url = "https://www.skiddle.com" + item.get("href")
        fests_urls_list.append(fest_url)

# collect fest info
for url in fests_urls_list:
    req = requests.get(url=url, headers=headers)
    try:
        soup = BeautifulSoup(req.text, "lxml")
        fest_name = soup.find("div", class_="MuiContainer-root MuiContainer-maxWidthFalse css-1krljt2").find("h1").text.strip()
        fest_data = soup.find("div", class_="MuiGrid-root MuiGrid-item MuiGrid-grid-xs-11 css-twt0ol").text.strip()
        print(fest_data)
    except Exception as ex:
        print(ex)
        print("This was not supposed to happen")

Query google images with copy right free filter using requests/urllib

I am trying to query images with the copyright-free filter. Even though the URL leads to the right settings in my code, for some reason the page read by both urllib and requests contains the first few images without the copyright-free and size filters. If anyone can help with this I would greatly appreciate it.
code:
#%%
import requests
import urllib.request
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request

url = 'https://google.com/search?q='
input = 'cat'
# string: tbm=isch --> means image search
# tbs=isz:m --> size medium
# il:cl --> copy right free (i think)
url = url + input + '&tbm=isch&tbs=isz:m%2Cil:cl'
print(url)
html = urlopen(Request(url, headers={'User-Agent': 'Google Chrome'}))
'''with urllib.request.urlopen(url) as response:
    html = response.read()
    print(html)'''
#print(str(r.content))
soup = BeautifulSoup(html.read(), 'html.parser')
#print(soup.prettify)
# using soup to find all img tags
results = soup.find_all('img')
str_result = str(results)
print(str_result)
lst_result = str_result.split(',')
# trying to get the very first link for the images with the appropriate settings
link = lst_result[4].split(' ')[4].split('"')[1]
#print(link)
# writing into the appropriate testing file, to be changed
file = open('.img1.png', 'wb')
get_img = requests.get(link)
file.write(get_img.content)
file.close()
import requests
import re, json

extentions = ['jpg', 'jpeg', 'png', 'gif', 'svg']

# determine image extention (not guaranteed, some links lack the extension)
def extention(url):
    # or use the "imghdr" package to determine the extention
    for ext in extentions:
        if url.endswith(f'.{ext}'):
            return ext
    return '.UNKNOWN'

URL = 'https://google.com/search'
params = {
    'q': 'cat',  # search term
    'tbm': 'isch',
    'tbs': 'isz:m,il:cl'
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.3'
}

r = requests.get(URL, params=params, headers=headers)
html = r.text

matches = re.findall(r'data:\[(?!\])(.*?)],\ sideChannel', html)  # the data lives in a script, not in <img> elements (initially)
data = json.loads(f'[{matches[1]}]')

images = []
for image in data[31][0][12][2]:  # the data structure may change some day, but it's consistent between requests as of now
    if type(image) is list:
        try:
            images.append(image[1][3][0])
        except:
            pass

images = list(dict.fromkeys(images))  # remove duplicate links

# retrieve and save the first image's data
print(images[0])
imgdata = requests.get(images[0], headers=headers).content
with open(f'img.{extention(images[0])}', 'wb') as file:
    file.write(imgdata)

Extract .jpg from HTML source code with Python

I set up this code to extract the image links from the following website. The problem is that it stops at record 19 and doesn't continue with the listing.
Can you help me?
import urllib.request
import os
import codecs
from bs4 import BeautifulSoup

tematica = 'fun'
url = "https://www.shutterstock.com/es/search/" + tematica

request = urllib.request.Request(url)
response = urllib.request.urlopen(request)
data_content = response.read()

Html_file = open("html_file.html", "wb")
Html_file.write(data_content)
Html_file.close()

html = codecs.open("html_file.html", 'r', 'utf-8').read()
soup = BeautifulSoup(html, 'html.parser')

for i, img_element in enumerate(soup.findAll('img', None)):
    try:
        img_src = img_element['src']
        print(i, img_src)
    except:
        pass
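One common reason an image listing stops after the first batch is lazy loading: later thumbnails only carry their URL in a data-src attribute until JavaScript swaps it into src. Whether that is what happens on this particular page is an assumption, but a check for both attributes is cheap:
for i, img_element in enumerate(soup.findAll('img', None)):
    # fall back to the lazy-load attribute when src is missing
    img_src = img_element.get('src') or img_element.get('data-src')
    if img_src:
        print(i, img_src)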

How do I get latest pdf file by given code using python

I tried the following code to download all the PDF files from the links, but it downloads every file each time I run it. What I want: the first time it should download all the PDFs, and on later runs it should only download the ones that are new (it should first check which ones are new).
My Code:
import requests
from bs4 import BeautifulSoup

root_url = 'https://www.iea.org'

def getLinks(url):
    all_links = []
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    for href in soup.find_all(class_='omrlist'):
        all_links.append(root_url + href.find('a').get('href'))
    return all_links

yearLinks = getLinks(root_url + '/oilmarketreport/reports/')

# get report URL
reportLinks = []
for url in yearLinks:
    links = getLinks(url)
    #reportLinks.extend(links)
    #print(reportLinks)
    i = 0
    for url_ in links:
        if "AnnualStatisticalSupplement" not in url_:
            url__ = url_.replace("org..", "org").replace("../", "")
            response = requests.get(url__, stream=True)
            lastindex = url__.rfind('/')
            strlen = len(url__)
            filename = url__[lastindex:strlen]
            with open('/home/pdfs/' + str(filename), 'wb') as pdffile:
                pdffile.write(response.content)
            i += 1
            print(url__)

print("Download Completed")
Then I need to store that file in MongoDB. How should I do that, using three columns (pdf name, reported date, flag of process)?
Sorry for the significant changes to your code, but it was too messy to read. If you only want to download the PDFs you don't have yet, you must add a check to control when to stop. By the way, if you store the page URL in your database, you won't need to request the page again to get the PDF name.
import requests
from bs4 import BeautifulSoup

root_url = 'https://www.iea.org'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0'}
downloaded = ["2018-02-13.pdf"]  # the latest i have

def getLinks(url):
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.text, 'lxml')
    li = soup.find_all("li", class_="omrlist")
    links = [root_url + href.a.get('href') for href in li]
    return links

def get_pdf(url, flag=1):
    # find page link in the month directory
    pdf_page = requests.get(url, headers=headers)
    soup = BeautifulSoup(pdf_page.text, 'lxml')
    li = soup.find_all("li", class_="omrlist")[::-1]  # latest -> old
    latest_pdf_set = [root_url + href.a.get('href') for href in li]
    # find pdf link
    pdf_links = []
    for pdf_url in latest_pdf_set:
        text = requests.get(pdf_url, headers=headers).text
        soup = BeautifulSoup(text, "lxml")
        link = soup.find("div", class_="omrreport pL10").find("a").get("href")
        if link.split("/")[-1] in downloaded:
            flag = 0  # flag = 0 means you found the pdf that you already had
            break
        pdf_links.append(root_url + link)
    return pdf_links, flag

yearLinks = getLinks(root_url + '/oilmarketreport/reports/')

all_ = []
for each in yearLinks:
    pdf_links = get_pdf(each)
    all_ += pdf_links[0]
    if not pdf_links[1]:
        # flag = 0, so stop
        break
print(all_)
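For the MongoDB part of the question, a rough sketch with pymongo; the database/collection names and the idea of parsing the reported date out of the filename are assumptions, not something taken from the original code:
from datetime import datetime
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017/")
collection = client["iea"]["reports"]          # assumed database/collection names

for link in all_:                              # all_ is built by the loop above
    pdf_name = link.split("/")[-1]             # e.g. "2018-02-13.pdf"
    record = {
        "pdf_name": pdf_name,
        "reported_date": datetime.strptime(pdf_name[:10], "%Y-%m-%d"),  # assumes the name starts with a date
        "processed": False,                    # flag of process
    }
    # upsert so re-running the script does not create duplicate documents
    collection.update_one({"pdf_name": pdf_name}, {"$set": record}, upsert=True)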

Unable to save image from web using urllib2

I want to save some images from a website using python urllib2 but when I run the code it saves something else.
This is my code:
import urllib2
from bs4 import BeautifulSoup

user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}
url = "http://m.jaaar.com/"

r = urllib2.Request(url, headers=headers)
page = urllib2.urlopen(r).read()
soup = BeautifulSoup(page)
imgTags = soup.findAll('img')
imgTags = imgTags[1:]

for imgTag in imgTags:
    imgUrl = "http://www.jaaar.com" + imgTag['src']
    imgUrl = imgUrl[0:-10] + imgUrl[-4:]
    fileName = "khabarnak-" + imgUrl[-12:]
    print fileName
    imgData = urllib2.urlopen(imgUrl).read()
    print imgUrl
    output = open("C:\\wamp\\www\\py\\pishkhan\\" + fileName, 'wb')
    output.write(imgData)
    output.close()
Any suggestions?
The site is returning a standard image back to you because you are scraping the site. Use the same 'trick' of setting the headers when retrieving the image:
imgRequest = urllib2.Request(imgUrl, headers=headers)
imgData = urllib2.urlopen(imgRequest).read()
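Dropped into the loop from the question, that change looks roughly like this (still Python 2, to match the original):
for imgTag in imgTags:
    imgUrl = "http://www.jaaar.com" + imgTag['src']
    imgUrl = imgUrl[0:-10] + imgUrl[-4:]
    fileName = "khabarnak-" + imgUrl[-12:]
    # request the image with the same User-Agent header used for the page
    imgRequest = urllib2.Request(imgUrl, headers=headers)
    imgData = urllib2.urlopen(imgRequest).read()
    output = open("C:\\wamp\\www\\py\\pishkhan\\" + fileName, 'wb')
    output.write(imgData)
    output.close()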
