My scraper throws an error instead of downloading images - Python

I've made a scraper to download images from a site. However, when I run it, it throws this error:

raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403

I used this method on other sites to scrape images and faced no issues. I can't figure out why this error shows up or what the workaround is. Hope someone will look into it.
import requests
import urllib.request
from lxml import html

def PictureScraping():
    url = "https://www.yify-torrent.org/search/1080p/"
    response = requests.get(url)
    tree = html.fromstring(response.text)
    titles = tree.xpath('//div[@class="movie-image"]')
    for title in titles:
        Pics = "https:" + title.xpath('.//img/@src')[0]
        urllib.request.urlretrieve(Pics, Pics.split('/')[-1])

PictureScraping()

You need to download images using the same web-scraping session you've used to get the initial page. Working code:
import requests
from lxml import html

def PictureScraping():
    url = "https://www.yify-torrent.org/search/1080p/"
    with requests.Session() as session:
        response = session.get(url)
        tree = html.fromstring(response.text)
        titles = tree.xpath('//div[@class="movie-image"]')
        for title in titles:
            image_url = title.xpath('.//img/@src')[0]
            image_name = image_url.split('/')[-1]
            print(image_name)

            image_url = "https:" + image_url

            # download image
            response = session.get(image_url, stream=True)
            if response.status_code == 200:
                with open(image_name, 'wb') as f:
                    for chunk in response.iter_content(1024):
                        f.write(chunk)

PictureScraping()

Related

Image source is different in HTML between my browser and GET request

I suspect this has happened due to my misunderstanding of how either lxml or html works, and I'd appreciate it if someone could fill in this blank in my knowledge.
My code is:
url = "https://prnt.sc/ca0000"
response = requests.get(url,headers={'User-Agent': 'Chrome'})
# Navigate to the correct img src.
tree = html.fromstring(response.content)
xpath = '/html/body/div[3]/div/div/img/#src'
imageURL = tree.xpath(xpath)[0]
print(imageURL)
When I do this, I expect to get a result such as:

data:image/png;base64,iVBORw0KGgoAAA...((THIS IS REALLY LONG))...Jggg==

which, if I understand correctly, is where the image is stored locally on my computer.
However, when I run the code I get:

"https://prnt.sc/ca0000"

Why are these different?
The problem is that this page uses JavaScript to put data:image/png;base64 ... in place of https://prnt.sc/ca0000, but requests can't run JavaScript.
However, there are two img elements with different src values: the first has a standard image URL (https://...) and the other has the fake https://prnt.sc/ca0000.
So this XPath works for me even without JavaScript:

xpath = '//img[@id="screenshot-image"]/@src'

This code gets the correct URL and downloads the image:
import requests
from lxml import html

url = "https://prnt.sc/ca0000"
response = requests.get(url, headers={'User-Agent': 'Chrome'})
tree = html.fromstring(response.content)

image_url = tree.xpath('//img[@id="screenshot-image"]/@src')[0]
print(image_url)

# --- download ---
response = requests.get(image_url, headers={'User-Agent': 'Chrome'})
with open('image.png', 'wb') as fh:
    fh.write(response.content)
Result
https://image.prntscr.com/image/797501c08d0a46ae93ff3a477b4f771c.png

Simple download.file() in R is not working with requests.get

I am attempting to convert R code to Python code. There is one line that I am having trouble with (code snippet 1).
I have tried all variations of fetching the file and the Python code creates a blank file with none of the contents: requests, wget, urllib.request, etc.
(1)
downloader = download.file(url = 'https://www.equibase.com/premium/eqbLateChangeXMLDownload.cfm',
                           destfile = 'C:/Users/bnewell/Desktop/test.xml', quiet = TRUE) # downloading XML file from site
unfiltered = xmlToList(xmlParse(download_file))
(2)
import requests

URL = 'https://www.equibase.com/premium/eqbLateChangeXMLDownload.cfm'
response = requests.head(URL, allow_redirects=True)
import requests, shutil

URL = 'https://www.equibase.com/premium/eqbLateChangeXMLDownload.cfm'
page = requests.get(URL, stream=True, allow_redirects=True,
                    headers={'user-agent': 'MyPC'})
with open("File.xml", "wb") as f:
    page.raw.decode_content = True
    shutil.copyfileobj(page.raw, f)
Manually adding a user-agent header fixes the file download, for some reason I'm not sure about.
I use shutil to download the raw file, which could be replaced by page.iter_content.
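For reference, a minimal sketch of that iter_content variant, using the same URL and user-agent as the snippet above (not verified against the live site):

import requests

URL = 'https://www.equibase.com/premium/eqbLateChangeXMLDownload.cfm'
page = requests.get(URL, stream=True, allow_redirects=True,
                    headers={'user-agent': 'MyPC'})
page.raise_for_status()

# Stream the body in chunks instead of copying page.raw with shutil;
# iter_content also takes care of decoding gzip/deflate content.
with open("File.xml", "wb") as f:
    for chunk in page.iter_content(chunk_size=8192):
        if chunk:
            f.write(chunk)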
Try to actually GET the request:

import requests

URL = 'https://www.equibase.com/premium/eqbLateChangeXMLDownload.cfm'
response = requests.get(URL, allow_redirects=True)

Then you can access what you are downloading with response.raw, response.text, response.content, etc.
For more details, see the actual docs.
Try something like this instead:
import os
import requests

url = "https://......"
r = requests.get(url, stream=True, allow_redirects=True)
if r.status_code != 200:
    print("Download failed:", r.status_code, r.headers, r.text)

file_path = r"C:\data\...."
with open(file_path, 'wb') as f:
    for chunk in r.iter_content(chunk_size=1024 * 8):
        if chunk:
            f.write(chunk)
            f.flush()
            os.fsync(f.fileno())

urllib.error.HTTPError: HTTP Error 400: Bad Request from trying to get a set of images

I was trying to get a set of images for a dataset to train tiny-yolo on, so I followed this tutorial:
https://www.youtube.com/watch?v=Lg4T9iJkwhE&index=5&list=PLX-LrBk6h3wSGvuTnxB2Kj358XfctL4BM
The first file that was written in the tutorial goes like this:
import os
import urllib.request as ulib
from bs4 import BeautifulSoup as Soup
import json

url_a = 'https://www.google.com/search?ei=1m7NWePfFYaGmQG51q7IBg&hl=en&q={}'
url_b = '\&tbm=isch&ved=0ahUKEwjjovnD7sjWAhUGQyYKHTmrC2kQuT0I7gEoAQ&start={}'
url_c = '\&yv=2&vet=10ahUKEwjjovnD7sjWAhUGQyYKHTmrC2kQuT0I7gEoAQ.1m7NWePfFYaGmQG51q7IBg'
url_d = '\.i&ijn=1&asearch=ichunk&async=_id:rg_s,_pms:s'
url_base = ''.join((url_a, url_b, url_c, url_d))
headers = {'User-Agent': 'Chrome/67.0.3396.99 Safari/537.36'}

def get_links(search_name):
    search_name = search_name.replace(' ', '+')
    url = url_base.format(search_name, 0)
    request = ulib.Request(url, None, headers)
    json_string = ulib.urlopen(request).read()
    page = json.loads(json_string)
    new_soup = Soup(page[1][1], 'lxml')
    images = new_soup.find_all('img')
    links = [image['src'] for image in images]
    return links

def save_images(links, search_name):
    directory = search_name.replace(' ', '_')
    if not os.path.isdir(directory):
        os.mkdir(directory)
    for i, link in enumerate(links):
        savepath = os.path.join(directory, '{:06}.png'.format(i))
        ulib.urlretrieve(link, savepath)

if __name__ == '__main__':
    search_name = 'my search query'
    links = get_links(search_name)
    save_images(links, search_name)
The only thing I changed was the headers variable, since my user agent is different from the one in the tutorial.
To my surprise, the script returned this error:

urllib.error.HTTPError: HTTP Error 400: Bad Request

Can anyone tell me what's wrong?
The error code itself explains the issue: you are trying to hit a URL which doesn't exist. Please correct your URL.
URL: https://www.google.com/search?ei=1m7NWePfFYaGmQG51q7IBg&hl=en&q=my+search+query\\&tbm=isch&ved=0ahUKEwjjovnD7sjWAhUGQyYKHTmrC2kQuT0I7gEoAQ&start=0\\&yv=2&vet=10ahUKEwjjovnD7sjWAhUGQyYKHTmrC2kQuT0I7gEoAQ.1m7NWePfFYaGmQG51q7IBg\\.i&ijn=1&asearch=ichunk&async=_id:rg_s,_pms:s%27
Try it in a browser and see what you get.
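As a sketch of one way to correct it, the query string can be assembled with urllib.parse.urlencode instead of string concatenation, so no stray backslashes end up in the URL. The parameter values below are copied from the question; whether Google still accepts this endpoint is not guaranteed:

from urllib.parse import urlencode

def build_search_url(search_name, start=0):
    # Same parameters as the tutorial's URL, but encoded properly.
    params = {
        'ei': '1m7NWePfFYaGmQG51q7IBg',
        'hl': 'en',
        'q': search_name,  # urlencode handles spaces, no need to replace them with '+'
        'tbm': 'isch',
        'ved': '0ahUKEwjjovnD7sjWAhUGQyYKHTmrC2kQuT0I7gEoAQ',
        'start': start,
        'yv': 2,
        'vet': '10ahUKEwjjovnD7sjWAhUGQyYKHTmrC2kQuT0I7gEoAQ.1m7NWePfFYaGmQG51q7IBg.i',
        'ijn': 1,
        'asearch': 'ichunk',
        'async': '_id:rg_s,_pms:s',
    }
    return 'https://www.google.com/search?' + urlencode(params)

print(build_search_url('my search query'))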

BeautifulSoup scraper downloaded images are corrupt

I greatly need help with my code. I was attempting to do an exercise from a book and I followed it exactly. The code worked and it downloaded the images; however, all the images that were downloaded were corrupt. I have no idea what's causing it or what I missed.
Thanks.
#! python3
# downloadXkcd.py - Downloads every single XKCD comic.

import requests, os, bs4

url = 'http://xkcd.com'
os.makedirs('xkcd', exist_ok=True)
while not url.endswith('#'):
    # Download the page.
    print('Downloading page %s...' % url)
    res = requests.get(url)
    res.raise_for_status()

    soup = bs4.BeautifulSoup(res.text, 'html.parser')

    # Find the URL of the comic image.
    comicElem = soup.select('#comic img')
    if comicElem == []:
        print('Could not find comic image')
    else:
        comicUrl = comicElem[0].get('src')
        # Download the image.
        print('Downloading image %s' % (comicUrl))
        res.raise_for_status()

        # Save the image to ./xkcd.
        imagefile = open(os.path.join('xkcd', os.path.basename(comicUrl)), 'wb')
        for chunk in res.iter_content(100000):
            imagefile.write(chunk)
        imagefile.close()

    # Get the prev button's url.
    prevlink = soup.select('a[rel="prev"]')[0]
    url = 'http://xkcd.com' + prevlink.get('href')

print('Done')
You are writing the wrong data to the file:

for chunk in res.iter_content(100000)

res is the data of the web page. You should be getting the data of the image at the URL comicUrl instead. I think you forgot this line:

print('Downloading image %s' % (comicUrl))
res = requests.get('http:' + comicUrl)

Note: I added http: before the URL because the image URLs you are extracting lack a scheme. You should define a function to check whether it is necessary to add it.
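The answer doesn't spell that helper out; one possible shape for it (the name absolute_url is made up here, and xkcd's image links are protocol-relative, so the first branch is the one that matters) is:

from urllib.parse import urlparse

def absolute_url(url, scheme='http'):
    # Prefix protocol-relative or scheme-less URLs so requests can fetch them.
    if url.startswith('//'):      # protocol-relative, e.g. //imgs.xkcd.com/comics/...
        return scheme + ':' + url
    if not urlparse(url).scheme:  # no scheme at all
        return scheme + '://' + url.lstrip('/')
    return url                    # already absolute, leave it unchanged

# Hypothetical example value, not taken from the live page:
print(absolute_url('//imgs.xkcd.com/comics/some_comic.png'))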

Images downloaded are blank images, instead of actual images

For learning purposes I am trying to download all the post images of a BuzzFeed article.
Here is my code:
import lxml.html
import string
import random
import requests

url = 'http://www.buzzfeed.com/mjs538/messages-from-creationists-to-people-who-believe-in-evolutio?bftw'
headers = {
    'User-Agent': 'Mozilla/5.0',
    'From': 'admin@jhvisser.com'
}

page = requests.get(url)
tree = lxml.html.fromstring(page.content)

#print(soup.prettify()).encode('ascii', 'ignore')
images = tree.cssselect("div.sub_buzz_content img")

def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
    return ''.join(random.choice(chars) for x in range(size))

for image in images:
    with open(id_generator() + '.jpg', 'wb') as handle:
        request = requests.get(image.attrib['src'], headers=headers, stream=True)
        for block in request.iter_content(1024):
            if not block:
                break
            handle.write(block)
What is retrieved are images that are all 110 bytes in size, and viewing them shows just an empty image. Am I doing something wrong in my code that is causing the issue? I don't have to use requests if there is an easier way to do this.
If you look closely at the source code of the webpage you are trying to crawl, you'll see that the image URLs you want are not specified in the src attribute of the img tags, but in the rel:bf_image_src attribute.
Changing image.attrib['src'] to image.attrib['rel:bf_image_src'] should fix your problem.
I didn't manage to replicate your code (it claims that cssselect isn't installed), but this code with BeautifulSoup and urllib2 runs smoothly on my computer and downloads all 22 pictures.
from itertools import count
from bs4 import BeautifulSoup
import urllib2
from time import sleep

url = 'http://www.buzzfeed.com/mjs538/messages-from-creationists-to-people-who-believe-in-evolutio?bftw'
headers = {
    'User-Agent': 'Non-commercical crawler, Steinar Lima. Contact: https://stackoverflow.com/questions/21616904/images-downloaded-are-blank-images-instead-of-actual-images'
}

r = urllib2.Request(url, headers=headers)
soup = BeautifulSoup(urllib2.urlopen(r))
c = count()

for div in soup.find_all('div', id='buzz_sub_buzz'):
    for img in div.find_all('img'):
        print img['rel:bf_image_src']
        with open('images/{}.jpg'.format(next(c)), 'wb') as img_out:
            req = urllib2.Request(img['rel:bf_image_src'], headers=headers)
            img_out.write(urllib2.urlopen(req).read())
            sleep(5)
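For comparison, here is the asker's original lxml/requests loop with only the attribute name changed, as suggested at the top of this answer. This is a sketch that reuses the images, headers and id_generator names from the question, and whether BuzzFeed still serves the rel:bf_image_src attribute is not verified here:

for image in images:
    image_url = image.attrib.get('rel:bf_image_src')
    if not image_url:
        continue  # skip img tags that don't carry the attribute
    with open(id_generator() + '.jpg', 'wb') as handle:
        response = requests.get(image_url, headers=headers, stream=True)
        for block in response.iter_content(1024):
            if not block:
                break
            handle.write(block)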
