Python code:
import urllib2
import requests
info_hash = '00001BD2C9F364C7DCB759DEC6BE02F913C96F72'
url = 'http://torrage.com/torrent/%s.torrent' % info_hash
print url
data = urllib2.urlopen(url).read()  # this data is wrong
# data = requests.get(url).content  # this data is correct
f = open('%s.torrent' % info_hash, 'wb')
f.write(data)
f.close()
I can't get the right torrent content with this code, but I do get the right torrent when I open the URL in a browser.
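One likely cause (an assumption on my part, not confirmed above) is that the server returns gzip-compressed bytes, which requests decompresses automatically but urllib2 does not. A minimal Python 2 sketch that checks for this, reusing the url built above:
import gzip
import urllib2
from StringIO import StringIO
response = urllib2.urlopen(url)
data = response.read()
# Decompress manually if the server sent gzip-encoded content.
if response.info().get('Content-Encoding') == 'gzip':
    data = gzip.GzipFile(fileobj=StringIO(data)).read()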
I have written this script that will retrieve the contents of a web page.
import requests
import bs4
with requests.session() as r:
    r = requests.get("https://www.example.com")
    response = r.text
    print(response)
However, I have a list of URLs in a text file. Is there any way I can pass the contents of this file directly to requests.get() instead of typing each one manually?
Just put it all in a loop.
import requests
import bs4
text_file_name = "list_of_urls.txt"
with requests.session() as session:
    with open(text_file_name) as file:
        for line in file:
            url = line.strip()
            if url:
                resp = session.get(url)
                response = resp.text
                print(response)
Note: you weren't using the requests session object, so I fixed that as well.
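For reference, list_of_urls.txt is assumed here to contain one URL per line, for example:
https://www.example.com
https://www.example.org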
You can just use a loop
Assuming file.txt is your file:
with requests.session() as session:
    with open('file.txt') as f:
        for line in f:
            r = session.get(line.strip())
            response = r.text
            print(response)
You can loop over all the URLs and execute a requests.get() for each one:
import requests
import bs4
with requests.session() as session:
    with open("urls.txt", "r") as f:
        urls = [line.strip() for line in f.readlines()]
    for url in urls:
        r = session.get(url)
        response = r.text
        print("Response for " + url)
        print(response)
import requests
file1 = open('myfile.txt', 'r')
URLS = file1.readlines()
for url in URLS:
    r = requests.get(url.strip())
    response = r.text
    print(response)
This would print the text content of all the URLs.
This code works fine, but how do I stop 404 pages from being downloaded? urllib requests always return 403 because the host does not allow Python clients. Is there another way to detect whether the file exists?
import requests
import os
while True:
    id = input("Enter ID:")
    if os.path.exists("1.mp3"):
        os.remove("1.mp3")
    url = 'http://www.texture.ml/kcl/{0}.mp3'.format(id)
    r = requests.get(url)
    with open("1.mp3", 'wb') as f:
        f.write(r.content)
You just need to add a check for the status code. For example:
import requests
import os
while True:
    id = input("Enter ID:")
    if os.path.exists("1.mp3"):
        os.remove("1.mp3")
    url = 'http://www.texture.ml/kcl/{0}.mp3'.format(id)
    r = requests.get(url)
    if r.status_code != 404:
        with open("1.mp3", 'wb') as f:
            f.write(r.content)
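If you also want to skip other error responses (my addition, not part of the original answer), requests exposes a convenience flag r.ok, which is True for any status code below 400:
r = requests.get(url)
if r.ok:  # False for 403, 404, 500, etc.
    with open("1.mp3", 'wb') as f:
        f.write(r.content)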
I am attempting to convert R code to Python. There is one line that I am having trouble with (code snippet 1 below).
I have tried all variations of requests, and the Python code creates a blank file with none of the contents.
I have tried requests, wget, urllib.request, etc.
(1)
# download the XML file from the site
downloader = download.file(url = 'https://www.equibase.com/premium/eqbLateChangeXMLDownload.cfm',
                           destfile = 'C:/Users/bnewell/Desktop/test.xml', quiet = TRUE)
unfiltered = xmlToList(xmlParse('C:/Users/bnewell/Desktop/test.xml'))
(2)
import requests
URL = 'https://www.equibase.com/premium/eqbLateChangeXMLDownload.cfm'
response = requests.head(URL, allow_redirects=True)
import requests
import shutil
URL = 'https://www.equibase.com/premium/eqbLateChangeXMLDownload.cfm'
page = requests.get(URL, stream=True, allow_redirects=True,
                    headers={'user-agent': 'MyPC'})
with open("File.xml", "wb") as f:
    page.raw.decode_content = True
    shutil.copyfileobj(page.raw, f)
Manually adding a user-agent header fixes the file download, for a reason I'm not sure about.
I use shutil to write the raw stream to disk; this could be replaced by page.iter_content.
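For completeness, a sketch of the iter_content variant mentioned above (same URL and headers as in the answer):
import requests
URL = 'https://www.equibase.com/premium/eqbLateChangeXMLDownload.cfm'
page = requests.get(URL, stream=True, allow_redirects=True,
                    headers={'user-agent': 'MyPC'})
with open("File.xml", "wb") as f:
    # iterate over the response body in fixed-size chunks
    for chunk in page.iter_content(chunk_size=8192):
        f.write(chunk)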
Try to actually GET the request:
import requests
URL = 'https://www.equibase.com/premium/eqbLateChangeXMLDownload.cfm'
response = requests.get(URL, allow_redirects=True)
Then you can access what you are downloading with response.raw, response.text, response.content etc.
For more details, see the requests documentation.
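For example, to save the body to disk once you have the response (a minimal sketch; the output filename is just an illustration):
with open("eqbLateChangeXML.xml", "wb") as f:
    f.write(response.content)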
Try something like this instead:
import os
import requests
url = "https://......"
r = requests.get(url, stream=True, allow_redirects=True)
if r.status_code != 200:
    print("Download failed:", r.status_code, r.headers, r.text)
file_path = r"C:\data\...."
with open(file_path, 'wb') as f:
    for chunk in r.iter_content(chunk_size=1024 * 8):
        if chunk:
            f.write(chunk)
            f.flush()
            os.fsync(f.fileno())
I tried to crawl a restaurant's address from the Google front-page information panel, but I get "urllib.error.HTTPError: HTTP Error 403: Forbidden" and the program does not run.
I am new to Python web scraping, please help.
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
import json
import re
import sys
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
# get the Google URL
url = "https://www.google.com/search?q=barbeque%20nation%20-%20noida"
request = urllib.request.Request(url)
response = urllib.request.urlopen(request)
soup = BeautifulSoup(response.read(), 'html.parser')
the_page = soup.prettify("utf-8")
hotel_json = {}
for line in soup.find_all('script', attrs={"type": "application/ld+json"}):
    details = line.text.strip()
    details = json.loads(details)
    hotel_json["name"] = details["name"]
    hotel_json["address"] = details["address"]["streetAddress"]
    break
with open(hotel_json["name"] + ".html", "wb") as file:
    file.write(the_page)
with open(hotel_json["name"] + ".json", 'w') as outfile:
    json.dump(hotel_json, outfile, indent=4)
Add a user-agent header
request = urllib.request.Request(url, headers = {'User-Agent' : 'Mozilla/5.0'})
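For clarity, a minimal sketch of how this fits into the original code (the rest stays the same):
request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
response = urllib.request.urlopen(request)
soup = BeautifulSoup(response.read(), 'html.parser')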
I would like to write a program to upload files to http://www.tinyupload.com/, so I searched for a way to submit an upload form.
I have written this code to upload a file:
import urllib.request
import urllib.parse
import http.cookiejar
import re
# Use cookies
cookie = http.cookiejar.CookieJar()
urllib.request.install_opener(urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie)))
# A function to download a URL (GET)
def urldownload(url):
    try:
        page = urllib.request.urlopen(url)
        return page.read().decode('iso-8859-2')
    except urllib.error.HTTPError:
        return False
# Get the URL from the upload form
def geturl(html):
    regex = re.compile(r'<form action="(.*?)" name="upload_form"', re.S)
    url = regex.findall(html)[0]
    return str(url)
# Get the sid from the URL
def getsessionid(url):
    return url[-26:]
# Upload a file
def upload(file):
    url = geturl(urldownload('http://s000.tinyupload.com/index.php'))
    sessionid = getsessionid(url)
    f = open(file).read()
    data = {'MAX_FILE_SIZE': '52428800',
            'uploaded_file': f,
            'file_description': 'File: %s' % (file),
            'sessionid': sessionid}
    data = urllib.parse.urlencode(data)
    result = urllib.request.urlopen(url, data.encode('iso-8859-2'))
    return result.read().decode('iso-8859-2')
    #return str(result.info())
I expect to get the page with the link to download the file, but I get the form again instead. What is wrong?
There is a simpler way to upload a file: use the requests library.
import re
import requests
session = requests.Session()
index_url = 'http://s000.tinyupload.com/index.php'
upload_url = 'http://s000.tinyupload.com/cgi-bin/upload.cgi?sid='
index_request = session.get(index_url)
PHPSESSID = index_request.cookies['PHPSESSID']
files = {'file': open('bitcoin.pdf', 'rb')}
r = requests.post(upload_url + PHPSESSID, files=files)
# Print the "File upload finished" page
print(r.text)
# Print the download link
print(re.search(r'http://s000\.tinyupload\.com/\?file_id=[^<]+', r.text).group(0))
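One small refinement (my addition, an assumption about intent rather than part of the original answer): open the file in a with block so the handle is closed once the upload finishes:
with open('bitcoin.pdf', 'rb') as f:
    r = requests.post(upload_url + PHPSESSID, files={'file': f})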