I am downloading a PDF from a link. The link is correct, but when I try to access it, an exception is thrown.
Error
raise InvalidURL(f"URL can't contain control characters. {url!r} "
http.client.InvalidURL: URL can't contain control characters. '/pnp/archivos/portal/doc/1305doc_NP 3215 DESTRUYEN POZA DE MACERACI%C3%93N Y GRAN CANTIDADDE INSUMOS QU%C3%8DMICOS.pdf' (found at least ' ')
Code
from urlunshort3 import UrlUnshortener
import urllib.request

def download_file2(download_url):
    print(download_url)
    url = download_url
    response = urllib.request.urlopen(url)  # fails here: the URL contains raw spaces
    data = response.read()
    with open('C:/Users/usuario/Desktop/files/example.pdf', 'wb') as archivo:
        archivo.write(data)
    with open('C:/Users/usuario/Desktop/files/example.pdf', 'r') as archivo:
        print("True")

download_file2(UrlUnshortener().resolve_short("http://bit" + ".ly/31wMeIN"))
Try this:
import urllib.request
import urllib.parse

def download_file2(download_url):
    print(download_url)
    # Percent-encode unsafe characters such as spaces. safe=':/%' keeps the
    # scheme separator and slashes intact and avoids double-encoding any
    # already percent-encoded sequences (like %C3%93 in the failing URL).
    url = urllib.parse.quote(download_url, safe=':/%')
    response = urllib.request.urlopen(url)
    data = response.read()
    with open('C:/Users/usuario/Desktop/files/example.pdf', 'wb') as archivo:
        archivo.write(data)
    with open('C:/Users/usuario/Desktop/files/example.pdf', 'r') as archivo:
        print("True")
Also try urllib.parse.quote_plus() if your URL contains spaces and you want them changed to plus signs.
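Note that quoting the entire URL string with the default safe characters can itself break the request, because quote() would encode the ':' after the scheme. A safer variant is to split the URL and quote only the path; a minimal sketch (the example URL is illustrative):

from urllib.parse import urlsplit, urlunsplit, quote

def sanitize_url(raw_url):
    # Split the URL into components and percent-encode only the path,
    # leaving the scheme and host untouched.
    parts = urlsplit(raw_url)
    return urlunsplit((parts.scheme, parts.netloc, quote(parts.path),
                       parts.query, parts.fragment))

print(sanitize_url('http://example.com/a file with spaces.pdf'))
# http://example.com/a%20file%20with%20spaces.pdf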
I think you can use wget.
Download it from https://pypi.org/project/wget/
import wget
wget.download(url)
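Note that wget.download() may hit the same control-character error on a URL with raw spaces (assumption: it fetches via urllib underneath), so you may still need to percent-encode the URL first. A minimal sketch; the URL and output filename are illustrative:

import urllib.parse
import wget

raw_url = 'http://example.com/a file with spaces.pdf'  # illustrative URL
url = urllib.parse.quote(raw_url, safe=':/')  # encode the spaces first
wget.download(url, out='example.pdf')  # 'out' chooses the local filename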
Related
This code works fine, but how do I stop 404 pages from being downloaded? urllib requests always return 403 because the host does not allow Python clients. Is there another way to detect whether the file exists?
import requests
import os

while True:
    id = input("Enter ID:")
    if os.path.exists("1.mp3"):
        os.remove("1.mp3")
    url = 'http://www.texture.ml/kcl/{0}.mp3'.format(id)
    r = requests.get(url)
    with open("1.mp3", 'wb') as f:
        f.write(r.content)
You just need to add a check for it. For example:
import requests
import os

while True:
    id = input("Enter ID:")
    if os.path.exists("1.mp3"):
        os.remove("1.mp3")
    url = 'http://www.texture.ml/kcl/{0}.mp3'.format(id)
    r = requests.get(url)
    if r.status_code != 404:
        with open("1.mp3", 'wb') as f:
            f.write(r.content)
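If you want to avoid transferring the body at all for missing files, a HEAD request can check existence first. A minimal sketch (note that some hosts answer HEAD differently than GET):

import requests

url = 'http://www.texture.ml/kcl/1.mp3'
head = requests.head(url, allow_redirects=True)
if head.status_code == 200:
    # The file exists, so fetch the actual content.
    r = requests.get(url)
    with open("1.mp3", 'wb') as f:
        f.write(r.content)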
This is the code I wrote in Python for opening a URL.
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import time
import requests
from random import randint
import urllib.parse

class AmazonReviews():
    def __init__(self):
        self.headers = {'User-Agent': 'Mozilla/5.0'}

    def open_url(self, url):
        values = {}
        data = urllib.parse.urlencode(values).encode('utf-8')
        req = urllib.request.Request(url, data, self.headers)
        response = urllib.request.urlopen(req)
        html = response.read()
        return html

    def fetch_reviews(self, all_reviews_link):
        try:
            url = "https://www.amazon.in" + all_reviews_link
            print(url)
            html = self.open_url(url)
        except HTTPError as e:
            print(e)

review = AmazonReviews()
review.fetch_reviews('/gp/profile/amzn1.account.AFBWOEM2CWLC7ZRQ7WK6FQYXH6AA/ref=cm_cr_arp_d_gw_btm?ie=UTF8')
I am passing the URL like this because in the main project it is scraped from an href attribute, which gives a relative path.
If there is any method to get the absolute URL, please suggest one.
Output -
https://www.amazon.in/gp/profile/amzn1.account.AFBWOEM2CWLC7ZRQ7WK6FQYXH6AA/ref=cm_cr_arp_d_gw_btm?ie=UTF8
HTTP Error 404: NotFound
Link to the code:
https://onlinegdb.com/SyFPXzWVI
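For turning a scraped relative href into an absolute URL, urllib.parse.urljoin is the standard tool; a minimal sketch:

from urllib.parse import urljoin

base = "https://www.amazon.in"
href = '/gp/profile/amzn1.account.AFBWOEM2CWLC7ZRQ7WK6FQYXH6AA/ref=cm_cr_arp_d_gw_btm?ie=UTF8'
print(urljoin(base, href))
# https://www.amazon.in/gp/profile/amzn1.account.AFBWOEM2CWLC7ZRQ7WK6FQYXH6AA/ref=cm_cr_arp_d_gw_btm?ie=UTF8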
Use Selenium instead:
from selenium import webdriver
import os
browser = webdriver.Chrome(executable_path=os.path.abspath(os.getcwd()) + "/chromedriver")
link = "https://www.amazon.in/gp/profile/amzn1.account.AFBWOEM2CWLC7ZRQ7WK6FQYXH6AA/ref=cm_cr_arp_d_gw_btm?ie=UTF8"
browser.get(link)
name = browser.find_element_by_xpath('//*[@id="customer-profile-name-header"]/div[2]/span').text
Output:
Dheeraj Malhotra
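Note that in Selenium 4 the find_element_by_* helpers were removed; the equivalent call with a current Selenium install is:

from selenium.webdriver.common.by import By

name = browser.find_element(By.XPATH, '//*[@id="customer-profile-name-header"]/div[2]/span').text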
I am attempting to convert R code to Python code. There is one line that I am having trouble with (code snippet 1).
I have tried all variations of requests, and the Python code creates a blank file with none of the contents:
requests, wget, urllib.request, etc.
(1)
downloader = download.file(url = 'https://www.equibase.com/premium/eqbLateChangeXMLDownload.cfm',
                           destfile = 'C:/Users/bnewell/Desktop/test.xml', quiet = TRUE)  # downloading XML file from site
unfiltered = xmlToList(xmlParse(download_file))
(2)
import requests
URL = 'https://www.equibase.com/premium/eqbLateChangeXMLDownload.cfm'
response = requests.head(URL, allow_redirects=True)
import requests, shutil

URL = 'https://www.equibase.com/premium/eqbLateChangeXMLDownload.cfm'
page = requests.get(URL, stream=True, allow_redirects=True,
                    headers={'user-agent': 'MyPC'})
with open("File.xml", "wb") as f:
    page.raw.decode_content = True  # decompress gzip/deflate transparently
    shutil.copyfileobj(page.raw, f)
Manually adding a user-agent header fixes the file download, for a reason I'm not sure about.
I use shutil to copy the raw stream to disk, which could be replaced by page.iter_content, as sketched below.
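For reference, the same download written with iter_content instead of shutil (a sketch of the alternative mentioned above):

import requests

URL = 'https://www.equibase.com/premium/eqbLateChangeXMLDownload.cfm'
page = requests.get(URL, stream=True, allow_redirects=True,
                    headers={'user-agent': 'MyPC'})
with open("File.xml", "wb") as f:
    # Stream the body in chunks instead of copying the raw file object.
    for chunk in page.iter_content(chunk_size=8192):
        f.write(chunk)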
Try to actually GET the request:
import requests
URL = 'https://www.equibase.com/premium/eqbLateChangeXMLDownload.cfm'
response = requests.get(URL, allow_redirects=True)  # allow_redirects is a keyword argument, not a header
Then you can access what you are downloading with response.raw, response.text, response.content etc.
For more details, see the requests documentation.
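For example, to write the downloaded body straight to disk (a minimal sketch using response.content):

import requests

URL = 'https://www.equibase.com/premium/eqbLateChangeXMLDownload.cfm'
response = requests.get(URL, allow_redirects=True)
with open("File.xml", "wb") as f:
    f.write(response.content)  # the full body as bytes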
Try something like this instead:
import os
import requests

url = "https://......"
r = requests.get(url, stream=True, allow_redirects=True)
if r.status_code != 200:
    print("Download failed:", r.status_code, r.headers, r.text)
file_path = r"C:\data\...."
with open(file_path, 'wb') as f:
    for chunk in r.iter_content(chunk_size=1024 * 8):
        if chunk:
            f.write(chunk)
            f.flush()
            os.fsync(f.fileno())
I tried to crawl a restaurant's address from the Google front-page information panel, but I am getting an "urllib.error.HTTPError: HTTP Error 403: Forbidden" error and the program does not run.
I am new to Python web scraping; please help.
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import json
import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

# Get the Google results page.
url = "https://www.google.com/search?q=barbeque%20nation%20-%20noida"
request = urllib.request.Request(url)
response = urllib.request.urlopen(request)  # raises HTTP Error 403 here
html = response.read()
soup = BeautifulSoup(html, 'html.parser')

hotel_json = {}
# Assumes the ld+json block exposes 'name' and 'address' fields.
for line in soup.find_all('script', attrs={"type": "application/ld+json"}):
    details = json.loads(line.text.strip())
    hotel_json["name"] = details["name"]
    hotel_json["address"] = details["address"]["streetAddress"]
    break

with open(hotel_json["name"] + ".html", "wb") as file:
    file.write(html)
with open(hotel_json["name"] + ".json", 'w') as outfile:
    json.dump(hotel_json, outfile, indent=4)
Add a user-agent header
request = urllib.request.Request(url, headers = {'User-Agent' : 'Mozilla/5.0'})
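Plugged into the question's code, the request becomes (a sketch; Google may still block automated clients or change its markup at any time):

import urllib.request

url = "https://www.google.com/search?q=barbeque%20nation%20-%20noida"
request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
response = urllib.request.urlopen(request)  # no longer 403 with a browser-like agent
html = response.read()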
I have a redirecting URL:
www.test.com
it will redirect me to
www.test.com/XXYYXXYY
and every time I open it, it redirects me to a new URL (XXYYXXYY changes every time),
so I want to save these URLs into a CSV file.
import urllib2
import csv
import sys
url = 'http://www.test.com'
u = urllib2.urlopen(url)
localFile = open('file.csv', 'w')
localFile.write(u.read())
localFile.close()
Is this code correct?
Thank you.
geturl() will give you the final URL
import urllib2
import csv
import sys
url = 'http://www.test.com'
u = urllib2.urlopen(url)
localFile = open('file.csv', 'w')
localFile.write(u.geturl())
localFile.close()
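Note this answer is Python 2 code; on Python 3, urllib2 was split into urllib.request, so the equivalent (a minimal sketch) is:

from urllib.request import urlopen

url = 'http://www.test.com'
u = urlopen(url)
with open('file.csv', 'w') as local_file:
    local_file.write(u.geturl())  # the URL after all redirects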