I'm new to Python and I'm writing code to upload a file using urllib2, but I can't make it work.
Here's the code:
class Get(object):
handlers = list()
def __init__(self,url):
self.url = url
self.request = urllib2.Request(url)
self.request.add_header('User-Agent',"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13")
def auth(self,username,password):
pass_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
pass_mgr.add_password(None, self.url, username, password)
handler = urllib2.HTTPBasicAuthHandler(pass_mgr)
self._add_handler(handler)
def perform(self):
try:
opener = self._opener()
res = opener.open(self.request)
to_return = {
'code': res.code,
'contents': res.read(),
'url': res.geturl(),
'headers': dict(res.info())
}
except urllib2.URLError as e:
if hasattr(e, 'reason'):
print 'Error accessing the server.'
print 'Reason: ', e.reason
elif hasattr(e, 'code'):
print 'The server couldn\'t fulfill the request.'
print e
else:
return to_return
def _add_handler(self,handler):
self.handlers.append(handler)
def _opener(self):
return urllib2.build_opener(*self.handlers)
class Multipart(object):
def __init__(self,url):
super(Multipart,self).__init__(url)
self.data = list()
def perform(self):
b = choose_boundary()
tmp = "".join(map(lambda x: "--%s \r\n %s" % (b,x), self.data ))
tmp += "--%s--\r\n" % b
self.request.add_data(tmp)
content_type = 'multipart/form-data; boundary=%s' % b
self.request.add_unredirected_header('Content-Type', content_type)
#self.request.add_header("Content-Type","multipart/form-data, boundary=%s" % b)
return super(HTTP.Multipart,self).perform()
def set_data(self,data,file = None):
for i in data:
if file:
self.data.append(self._encode_file(i,**data[i]))
else:
self.data.append(self._encode_text(i,data[i]))
def _encode_text(self,key,value):
return "Content-Disposition: form-data; name=\"%s\"\r\n\r\n%s\r\n" % (key, value)
def _encode_file(self,key,path,filename=None,mime_type=None):
if not exists(path):
raise RuntimeError('%s not found' % path)
fname = filename or basename(path)
mime = mime_type or guess_type(path)[0] or 'application/octet-stream'
size = getsize(path)
content = ""
with open(path,'rb') as fobj:
content = fobj.read(size)
converted_text = "Content-Disposition: form-data; name=\"%s\"; filename=\"%s\"\r\n" % (key,fname)
converted_text += "Content-Transfer-Encoding: binary\r\n"
converted_text += "Content-Type: %s \r\n" % mime
converted_text += "Content-Length: %s \r\n" % size
converted_text += "\r\n %s \r\n" % content
return converted_text
Thanks to MultipartPostHandler, everything is working fine now.
class Post(Get): # inherits the Get class above
def __init__(self,url,data,multipart=False):
super(HTTP.Post,self).__init__(url)
if multipart:
from MultipartPostHandler import MultipartPostHandler
self._add_handler(MultipartPostHandler)
self.request.add_data(data)
else:
self.request.add_data(urlencode(data))
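For anyone else hitting this, here is a minimal sketch of the MultipartPostHandler approach that ended up working. It assumes the classic MultipartPostHandler recipe used above; the URL and field names are placeholders:

```python
# Python 2 / urllib2 sketch using MultipartPostHandler (placeholder URL and fields).
import urllib2
from MultipartPostHandler import MultipartPostHandler

opener = urllib2.build_opener(MultipartPostHandler)
params = {
    'description': 'some text field',           # ordinary form field
    'file': open('/path/to/upload.bin', 'rb'),  # file objects are sent as file parts
}
response = opener.open('http://example.com/upload', params)
print response.code, response.read()
```

One likely reason the hand-rolled encoder above failed is the stray spaces around the boundary and before the part headers in `"--%s \r\n %s"`: servers generally expect `--boundary\r\n` followed immediately by the part headers, a blank line, and then the data.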
Related
I need to check whether a GPS device's communication protocol is compatible with the logging server. The application provides a test that compares a sent frame against the protocols it supports and checks whether the frame can be read by one of the already supported protocols. I wanted to use the shared test script for this, but I get an error.
https://github.com/traccar/traccar/blob/master/tools/test-integration.py
```
import sys
import os
import xml.etree.ElementTree
import urllib
import urllib.parse
import urllib.request as urllib2
import json
import socket
import time
messages = {
'gps103' : 'imei:123456789012345,help me,1201011201,,F,120100.000,A,6000.0000,N,13000.0000,E,0.00,;'
}
baseUrl = 'http://172.16.43.210:8082'
user = { 'email' : 'admin', 'password' : 'admin' }
debug = '-v' in sys.argv
def load_ports():
ports = {}
dir = os.path.dirname(os.path.abspath(__file__))
root = xml.etree.ElementTree.parse(dir + '\\default.xml').getroot()
for entry in root.findall('entry'):
key = entry.attrib['key']
if key.endswith('.port'):
ports[key[:-5]] = int(entry.text)
if debug:
print('\nports: %s\n' % repr(ports))
return ports
def login():
request = urllib2.Request(baseUrl + '/api/session')
response = urllib2.urlopen(request, urllib.parse.urlencode(user))
if debug:
print('\nlogin: %s\n' % repr(json.load(response)))
return response.headers.get('Set-Cookie')
def remove_devices(cookie):
request = urllib2.Request(baseUrl + '/api/devices')
request.add_header('Cookie', cookie)
response = urllib2.urlopen(request)
data = json.load(response)
if debug:
print ('\ndevices: %s\n' % repr(data))
for device in data:
request = urllib2.Request(baseUrl + '/api/devices/' + str(device['id']))
request.add_header('Cookie', cookie)
request.get_method = lambda: 'DELETE'
response = urllib2.urlopen(request)
def add_device(cookie, unique_id):
request = urllib2.Request(baseUrl + '/api/devices')
request.add_header('Cookie', cookie)
request.add_header('Content-Type', 'application/json')
device = { 'name' : unique_id, 'uniqueId' : unique_id }
response = urllib2.urlopen(request, json.dumps(device))
data = json.load(response)
return data['id']
def send_message(port, message):
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.connect(('127.0.0.1', port))
s.send(message)
s.close()
def get_protocols(cookie, device_id):
params = { 'deviceId' : device_id, 'from' : '2000-01-01T00:00:00.000Z', 'to' : '2050-01-01T00:00:00.000Z' }
request = urllib2.Request(baseUrl + '/api/positions?' + urllib.urlencode(params))
request.add_header('Cookie', cookie)
request.add_header('Content-Type', 'application/json')
request.add_header('Accept', 'application/json')
response = urllib2.urlopen(request)
protocols = []
for position in json.load(response):
protocols.append(position['protocol'])
return protocols
ports = load_ports()
cookie = login()
remove_devices(cookie)
devices = {
'123456789012345' : add_device(cookie, '123456789012345'),
'123456789012' : add_device(cookie, '123456789012'),
'1234567890' : add_device(cookie, '1234567890'),
'123456' : add_device(cookie, '123456'),
'1234' : add_device(cookie, '1234')
}
all = set(ports.keys())
protocols = set(messages.keys())
print ('Total: %d' % len(all))
print ('Missing: %d' % len(all - protocols))
print ('Covered: %d' % len(protocols))
#if all - protocols:
# print '\nMissing: %s\n' % repr(list((all - protocols)))
for protocol in messages:
send_message(ports[protocol], messages[protocol])
time.sleep(10)
for device in devices:
protocols -= set(get_protocols(cookie, devices[device]))
print ('Success: %d' % (len(messages) - len(protocols)))
print ('Failed: %d' % len(protocols))
if protocols:
print ('\nFailed: %s' % repr(list(protocols)))
```
And I got this error:
File "C:\Users\ISUIT\Desktop\Ew.Prac\tt.py", line 159, in <module>
cookie = login()
File "C:\Users\ISUIT\Desktop\Ew.Prac\tt.py", line 112, in login
response = urllib2.urlopen(request, urllib.parse.urlencode(user))
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.1776.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 214, in urlopen
return opener.open(url, data, timeout)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.1776.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 514, in open
req = meth(req)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.1776.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 1277, in do_request_
raise TypeError(msg)
TypeError: POST data should be bytes, an iterable of bytes, or a file object. It cannot be of type str.
I tried to use bytes(string, 'utf-8') but it doesn't work.
Use string.encode('utf-8') to convert the string to bytes.
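For reference, a minimal sketch of that fix applied to the login() call in the script above (same base URL and credentials as in the script; not tested against a live Traccar instance):

```python
import urllib.parse
import urllib.request as urllib2

baseUrl = 'http://172.16.43.210:8082'
user = {'email': 'admin', 'password': 'admin'}

request = urllib2.Request(baseUrl + '/api/session')
# urlencode() returns a str; in Python 3, urlopen() requires POST data as bytes
data = urllib.parse.urlencode(user).encode('utf-8')
response = urllib2.urlopen(request, data)
print(response.headers.get('Set-Cookie'))
```

Note that the same script will also hit `urllib.urlencode(params)` in get_protocols(), which in Python 3 needs to be `urllib.parse.urlencode(params)`, and send_message() will need the message encoded to bytes before `s.send()`.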
I was trying to download a full course from LinkedIn Learning using code from GitHub. I have already downloaded a couple of courses before, but this time, when I tried to download another course, the error below appeared.
PS: I do have a premium LinkedIn membership. I can't watch a course online all the time, which is why I download courses to my PC.
# -*- coding: utf-8 -*-
import requests
from requests import Session
from bs4 import BeautifulSoup
import urllib
import sys
import re
import os
import string
import config
import logging
reload(sys)
sys.setdefaultencoding('utf-8')
login_url = 'https://www.linkedin.com/'
post_login_url = 'https://www.linkedin.com/uas/login-submit'
course_api_url = 'https://www.linkedin.com/learning-api/detailedCourses??fields=fullCourseUnlocked,releasedOn,' \
'exerciseFileUrls,exerciseFiles&addParagraphsToTranscript=true&courseSlug=%s&q=slugs'
video_api_url = 'https://www.linkedin.com/learning-api/detailedCourses?addParagraphsToTranscript=false&courseSlug=%s' \
'&q=slugs&resolution=_720&videoSlug=%s'
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'en-US,en;q=0.9',
'Connection': 'keep-alive',
'Content-Type': 'application/x-www-form-urlencoded',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/66.0.3359.181 Safari/537.36'
}
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
class Lld:
def __init__(self):
self.session = Session()
self.base_path = config.BASE_DOWNLOAD_PATH if config.BASE_DOWNLOAD_PATH else 'out'
@staticmethod
def plain_cookies(cookies):
plain = ''
for k, v in cookies.iteritems():
plain += k + '=' + v + '; '
return plain[:-2]
@staticmethod
def format_string(raw_string):
replacement_dict = {u'Ä': 'Ae', u'Ö': 'Oe', u'Ü': 'Ue', u'ä': 'ae', u'ö': 'oe', u'ü': 'ue', ':': ' -'}
invalid_chars = r'[^A-Za-z0-9\-\.]+'
u_map = {ord(key): unicode(val) for key, val in replacement_dict.items()}
raw_string = raw_string.translate(u_map)
raw_string = re.sub(invalid_chars, ' ', raw_string).strip().encode('utf-8')
i = 0
for c in raw_string:
if c in string.ascii_letters:
break
i += 1
return raw_string[i:]
@staticmethod
def format_time(ms):
seconds, milliseconds = divmod(ms, 1000)
minitues, seconds = divmod(seconds, 60)
hours, minitues = divmod(minitues, 60)
return '%d:%02d:%02d,%02d' % (hours, minitues, seconds, milliseconds)
def download_file(self, url, path, file_name):
resp = self.session.get(url, stream=True)
if not os.path.exists(path):
os.makedirs(path)
try:
with open(path + '/' + file_name, 'wb') as f:
for chunk in resp.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
except Exception as e:
os.remove(path + '/' + file_name)
print(e)
def download_sub(self, subs, path, file_name):
with open(path + '/' + file_name, 'a') as f:
i = 1
for sub in subs:
t_start = sub['transcriptStartAt']
if i == len(subs):
t_end = t_start + 5000
else:
t_end = subs[i]['transcriptStartAt']
caption = sub['caption']
f.write('%s\n' % str(i))
f.write('%s --> %s\n' % (self.format_time(t_start), self.format_time(t_end)))
f.write('%s\n\n' % caption)
i += 1
def download_desc(self, desc, course_url, path, file_name):
if not os.path.exists(path):
os.makedirs(path)
with open(path + '/' + file_name, 'a') as f:
f.write('%s\n\n%s' % (desc, course_url))
def get_logged_session(self):
logging.info('Authenticating to LinkedIn')
login_page = BeautifulSoup(self.session.get(login_url).text, 'html.parser')
csrf = login_page.find(id='loginCsrfParam-login')['value']
logging.info('Csfr token: %s' % csrf)
login_data = urllib.urlencode(
{'session_key': config.USERNAME, 'session_password': config.PASSWORD, 'isJsEnabled': 'false',
'loginCsrfParam': csrf})
headers['Cookie'] = self.plain_cookies(requests.utils.dict_from_cookiejar(self.session.cookies))
self.session.headers.update(headers)
resp = self.session.post(post_login_url, data=login_data, allow_redirects=True)
if resp.status_code != 200:
logging.error('Could not authenticate to LinkedIn')
else:
logging.info('Authentication successfully completed')
def download_courses(self):
token = self.session.cookies.get('JSESSIONID').replace('"', '')
self.session.headers['Csrf-Token'] = token
self.session.headers['Cookie'] = self.plain_cookies(requests.utils.dict_from_cookiejar(self.session.cookies))
self.session.headers.pop('Accept')
for course in config.COURSES:
resp = self.session.get(course_api_url % course)
course_data = resp.json()['elements'][0]
course_name = self.format_string(course_data['title'])
logging.info('Starting download of course [%s]...' % course_name)
course_path = '%s/%s' % (self.base_path, course_name)
chapters_list = course_data['chapters']
chapter_index = 1
logging.info('Parsing course\'s chapters...')
logging.info('%d chapters found' % len(chapters_list))
for chapter in chapters_list:
chapter_name = self.format_string(chapter['title'])
logging.info('Starting download of chapter [%s]...' % chapter_name)
chapter_path = '%s/%s - %s' % (course_path, str(chapter_index).zfill(2), chapter_name)
if chapter_name == '':
chapter_path = chapter_path[:-3]
videos_list = chapter['videos']
video_index = 1
logging.info('Parsing chapters\'s videos')
logging.info('%d videos found' % len(videos_list))
for video in videos_list:
video_name = self.format_string(video['title'])
video_slug = video['slug']
video_data = (self.session.get(video_api_url % (course, video_slug)))
try:
video_url = re.search('"progressiveUrl":"(.+)","streamingUrl"', video_data.text).group(1)
except:
logging.error('Can\'t download the video [%s], probably is only for premium users' % video_name)
continue
logging.info('Downloading video [%s]' % video_name)
self.download_file(video_url, chapter_path, '%s - %s.mp4' % (str(video_index).zfill(2), video_name))
video_data = video_data.json()['elements'][0]
if config.SUBS:
try:
subs = video_data['selectedVideo']['transcript']['lines']
except KeyError:
logging.info('No subtitles avaible')
else:
logging.info('Downloading subtitles')
self.download_sub(subs, chapter_path, '%s - %s.srt' % (str(video_index).zfill(2), video_name))
video_index += 1
chapter_index += 1
exercises_list = course_data['exerciseFiles']
for exercise in exercises_list:
try:
ex_name = exercise['name']
ex_url = exercise['url']
except (KeyError, IndexError):
logging.info('Can\'t download an exercise file for course [%s]' % course_name)
else:
self.download_file(ex_url, course_path, ex_name)
description = course_data['description']
logging.info('Downloading course description')
self.download_desc(description, 'https://www.linkedin.com/learning/%s' % course, course_path, 'Description.txt')
def main():
lld = Lld()
lld.get_logged_session()
lld.download_courses()
if __name__ == '__main__':
main()
The error that appears:
Traceback (most recent call last):
File "lld.py", line 187, in <module>
main()
File "lld.py", line 182, in main
lld.get_logged_session()
File "lld.py", line 104, in get_logged_session
csrf = login_page.find(id='loginCsrfParam-login')['value']
TypeError: 'NoneType' object has no attribute '__getitem__'
This error means that login_page.find(id='loginCsrfParam-login') returned None, so subscripting it with ['value'] fails.
This is probably because the page you are parsing with Beautiful Soup (a fairly slow parser, by the way, but good for learning) does not contain the requested tag, or the tag does not have a 'value' attribute.
EDIT:
The reason you are getting this error is that there is no tag with the id "loginCsrfParam-login".
Here is a breakdown of what is going on in the interpreter:
The page is fetched (www.linkedin.com/index.html); it does not contain anything with the id "loginCsrfParam-login".
Beautiful Soup searches the parsed page for a tag with that id, doesn't find it, and so returns None.
The code doesn't check the return value of find(), so the None result goes unnoticed.
Python then fails because you try to subscript None; there is no object to take ['value'] from.
Your login_page doesn't contain that id, so the element whose value you are trying to read does not exist; find() returns None, which is why you get the 'NoneType' error.
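If it helps, here is a defensive sketch of the csrf lookup in get_logged_session() that fails with a readable message instead of a TypeError. The fallback field name 'loginCsrfParam' is an assumption; inspect the actual login page to confirm what LinkedIn currently serves:

```python
import requests
from bs4 import BeautifulSoup

session = requests.Session()
login_url = 'https://www.linkedin.com/'

login_page = BeautifulSoup(session.get(login_url).text, 'html.parser')
csrf_input = login_page.find(id='loginCsrfParam-login')
if csrf_input is None:
    # Fallback selector: an assumption, since LinkedIn changes its markup over time
    csrf_input = login_page.find('input', attrs={'name': 'loginCsrfParam'})
if csrf_input is None:
    raise RuntimeError('CSRF token not found; the login page markup has probably changed')
csrf = csrf_input['value']
```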
I need some help figuring out why this isn't working the way it should. The goal was to stand up a local Python server (similar to SimpleHTTPServer), but have it log requests and responses to a file so I can get a better understanding of how the browser DOM behaves. I want to be able to browse to localhost and have it react like a web server would.
When using this, it appears to hang at localhost and does not redirect to the 'sample.html' file. Any thoughts on why this might be happening?
Code:
import os, re, socket, thread, threading;
# Settings that you may want to change:
WEBSERVER_PORT = 28876;
LOGS_FOLDER = os.path.join(os.getcwd(), 'logs');
TEST_HTML = 'sample.html';
VERBOSE_OUTPUT = True;
# Settings that you may need to change if you add new file types:
DEFAULT_MIME_TYPE = 'application/octet-stream';
REGISTERED_MIME_TYPES = {
'text/html': ['htm', 'html'],
'text/javascript': ['js'],
'image/svg+xml': ['svg'],
'image/jpeg': ['jpg'],
};
def MimeType(path):
last_dot = path.rfind('.');
if last_dot != -1:
ext = path[last_dot + 1:].lower();
for mime_type, exts in REGISTERED_MIME_TYPES.items():
if ext in exts:
return mime_type;
return DEFAULT_MIME_TYPE;
def GenerateRedirect(new_path):
return (301, 'Moved permanently', \
['Location: http://localhost:%s%s' % (WEBSERVER_PORT, new_path)], \
'text/html', '<a href="%s">%s</a>' % (new_path, new_path));
def Reply404(path, query, data):
return 404, 'Not found', 'text/plain', 'The path %s was not found' % path;
def ReplyFile(path, query, data):
path = os.path.join(os.getcwd(), path[1:].replace(os.altsep, os.sep));
if os.path.isfile(path):
try:
data = open(path, 'rb').read();
except:
print ' Cannot open file %s' % path;
return 500, 'Internal server error', 'text/plain', \
'The path %s could not be read' % path;
if VERBOSE_OUTPUT:
print ' Reply = file %s' % path;
return 200, 'OK', MimeType(path), data;
print ' Cannot find file %s' % path;
return 404, 'Not found', 'text/plain', 'The path %s was not found' % path;
def ReplyLog(path, query, data):
path = os.path.join(LOGS_FOLDER, path[len('/logs/'):].replace('/', os.sep));
try:
open(path, 'ab').write(data + '\r\n');
except:
print ' Cannot write to log file %s' % path;
return 500, 'Internal server error', 'text/plain', \
'The path %s could not be read' % path;
if VERBOSE_OUTPUT:
print ' Wrote %s bytes to log file %s' % (len(data), path);
return 200, 'OK', 'text/plain', 'Log successful';
# Replies contains entries in one of two forms:
# "regexp": ("mimetype", "body"),
# "regexp": function,
# The first form causes the server to reply with "200 OK" and the given body and
# mimetype. The second form requires "function" to accept the request path, query
# string and request body as arguments and return a tuple containing the HTTP return
# code, HTTP reason message, mimetype and body (optionally with a list of extra header
# lines between the reason and the mimetype).
replies = [
(r'^/$', GenerateRedirect('/' + TEST_HTML)),
(r'^/logs/.*$', ReplyLog),
(r'^.*$', ReplyFile),
];
def Main():
if not os.path.isdir(LOGS_FOLDER):
try:
os.mkdir(LOGS_FOLDER);
except:
print 'Cannot create logs folder %s' % LOGS_FOLDER;
return;
server_socket = socket.socket();
server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1);
server_socket.bind(('', WEBSERVER_PORT));
server_socket.listen(1);
print 'Webserver running at http://localhost:%d/' % WEBSERVER_PORT;
print;
thread_counter = 1;
while 1:
client_socket, client_address = server_socket.accept();
thread = threading.Thread(target = ConnectionThread, \
args = (client_socket, client_address));
thread.start();
thread_counter += 1;
def ConnectionThread(client_socket, client_address):
try:
print '** Connection from %s:%s opened.' % client_address;
while 1:
try:
request_headers, request_content = ReadRequest(client_socket);
if request_headers is None:
break;
response = HandleRequest(request_headers, request_content);
try:
client_socket.send(response);
except socket.error, e:
raise ConnectionError('Cannot send response');
except ConnectionError, e:
print ' ConnectionError: %s' % e;
try:
client_socket.close();
except:
pass;
break;
print '** Connection from %s:%s closed' % client_address;
except:
thread.interrupt_main();
class ConnectionError(Exception):
def __init__(self, value):
self.value = value;
def __str__(self):
return self.value;
def GetContentLength(request_headers):
for request_header in request_headers:
if request_header.find(':') != -1:
name, value = request_header.split(':', 1);
if name.strip().lower() == 'content-length':
try:
return int(value);
except ValueError:
raise ConnectionError('Bad content-length value: %s' % value);
return None;
def ReadRequest(client_socket):
request = '';
request_headers = None;
request_headers_length = None;
if VERBOSE_OUTPUT:
print '>> Accepted request, reading headers...',;
while request_headers is None:
try:
request += client_socket.recv(256);
except socket.error, e:
if VERBOSE_OUTPUT:
print;
raise ConnectionError('Connection dropped while reading request headers.');
if len(request) == 0:
if VERBOSE_OUTPUT:
print;
# raise ConnectionError('Connection closed.');
return None, None;
request_headers_length = request.find('\r\n\r\n');
if request_headers_length != -1:
request_headers = request[:request_headers_length].split('\r\n')[:-1];
# One line break is part of the headers
request_headers_length += 2;
# The other is not part of the headers or the content:
request_content_length = GetContentLength(request_headers);
if request_content_length is None:
if VERBOSE_OUTPUT:
print '\r>> Accepted request, read %d bytes of headers and ' \
'no content.' % (request_headers_length);
return request_headers, None;
request_content = request[request_headers_length + 2:];
else:
if VERBOSE_OUTPUT:
print '\r>> Accepted request, read %d bytes of headers...' % \
len(request),;
while len(request_content) < request_content_length:
if VERBOSE_OUTPUT:
print '\r>> Accepted request, read %d bytes of headers and ' \
'%d/%d bytes of content...' % (request_headers_length, \
len(request_content), request_content_length),;
read_size = request_content_length - len(request_content);
try:
request_content += client_socket.recv(read_size);
except socket.error, e:
if VERBOSE_OUTPUT:
print;
raise ConnectionError('Connection dropped while reading request content.');
if VERBOSE_OUTPUT:
print '\r>> Accepted request, read %d bytes of headers and ' \
'%d bytes of content. %s' % (request_headers_length, \
len(request_content), ' ' * len(str(request_content_length)));
return request_headers, request_content;
def HandleRequest(request_headers, request_content):
end_method = request_headers[0].find(' ');
if end_method == -1:
raise ConnectionError('Bad request header; no method recognized');
method = request_headers[0][:end_method];
end_path = request_headers[0].find(' ', end_method + 1);
if end_path == -1:
raise ConnectionError('Bad request header; no path recognized');
path = request_headers[0][end_method + 1:end_path];
query = None;
start_query = path.find('?');
if start_query != -1:
query = path[start_query:];
path = path[:start_query];
if VERBOSE_OUTPUT:
print ' method=%s, path=%s, query=%s, %s headers' % \
(method, path, query, len(request_headers));
code, reason, mime_type, body = 404, 'Not found', 'text/plain', 'Not found';
response = None;
for path_regexp, response in replies:
if re.match(path_regexp, path):
if type(response) != tuple:
response = response(path, query, request_content);
break;
assert type(response) == tuple and len(response) in [2, 4, 5], \
'Invalid response tuple %s' % repr(response);
code, reason, headers, mime_type, body = 200, 'OK', [], 'text/plain', '';
if len(response) == 2:
mime_type, body = response;
elif len(response) == 4:
code, reason, mime_type, body = response;
else:
code, reason, headers, mime_type, body = response;
response_lines = [
'HTTP/1.1 %03d %s' % (code, reason),
'Content-Type: %s' % mime_type,
'Date: Sat Aug 28 1976 09:15:00 GMT',
'Expires: Sat Aug 28 1976 09:15:00 GMT',
'Cache-Control: no-cache, must-revalidate',
'Pragma: no-cache',
'Accept-Ranges: bytes',
'Content-Length: %d' % len(body),
] + headers + [
'',
body
];
response = '\r\n'.join(response_lines);
if VERBOSE_OUTPUT:
print '<< %s (%d bytes %s)' % \
(response.split('\r\n')[0], len(response), mime_type);
return response;
if __name__ == "__main__":
Main();
Any feedback would be greatly appreciated.
Thank you!
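Not a direct fix for the code above, but for comparison, here is a minimal sketch of the same idea built on the standard library's BaseHTTPServer/SimpleHTTPServer (Python 2, to match the script); the port and log path are placeholders:

```python
# Minimal logging web server sketch (Python 2). Serves files from the current
# directory and appends one line per request to logs/requests.log.
import os
import BaseHTTPServer
import SimpleHTTPServer

PORT = 28876
LOG_FOLDER = 'logs'
LOG_FILE = os.path.join(LOG_FOLDER, 'requests.log')

class LoggingHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
    def log_message(self, format, *args):
        # SimpleHTTPRequestHandler normally logs to stderr; write to a file instead
        with open(LOG_FILE, 'ab') as f:
            f.write('%s - %s\r\n' % (self.client_address[0], format % args))

if __name__ == '__main__':
    if not os.path.isdir(LOG_FOLDER):
        os.mkdir(LOG_FOLDER)
    server = BaseHTTPServer.HTTPServer(('', PORT), LoggingHandler)
    print 'Serving on http://localhost:%d/' % PORT
    server.serve_forever()
```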
To get training data, I wrote a crawler to follow the top 500 websites on Alexa to a depth of 2 and write all the links it finds to a file. Right now, it looks for all the links in the HTML and writes them to a file. The problem is that the crawler misses all links to ads, some of which are located in iframes or in CSS files. How can I change my web crawler so that it scrapes all links, including ads? The relevant code can be found below.
class Crawler(object):
def __init__(self, root, depth, locked=True):
self.root = root
self.depth = depth
self.locked = locked
self.host = urlparse.urlparse(root)[1]
self.urls = []
self.links = 0
self.followed = 0
def crawl(self):
#print " in crawl"
page = Fetcher(self.root)
q = Queue()
#print "made fetcher"
try:
page.fetch()
if page.urls == []:
print "Error: could not fetch urls for %s" % (self.root)
return
#raise KeyboardInterrupt
else:
target = open("output.txt", 'w')
for url in page.urls:
q.put(url)
target.write((url+'\n').encode('utf-8'))
followed = [self.root]
target.close()
except Exception as e:
print('Error: could not fetch urls')
raise KeyboardInterrupt
'''
q = Queue()
target = open("output.txt", 'w')
for url in page.urls:
q.put(url)
target.write((url+'\n').encode('utf-8'))
followed = [self.root]
target.close()
#print followed
'''
n = 0
while True:
try:
url = q.get()
except QueueEmpty:
break
n += 1
if url not in followed:
try:
host = urlparse.urlparse(url)[1]
if self.locked and re.match(".*%s" % self.host, host):
followed.append(url)
#print url
self.followed += 1
page = Fetcher(url)
page.fetch()
for i, url in enumerate(page):
if url not in self.urls:
self.links += 1
q.put(url)
self.urls.append(url)
with open("data.out", 'w') as f:
f.write(url)
if n > self.depth and self.depth > 0:
break
except Exception, e:
print "ERROR: Can't process url '%s' (%s)" % (url, e)
print format_exc()
class Fetcher(object):
def __init__(self, url):
self.url = url
self.urls = []
def __getitem__(self, x):
return self.urls[x]
def _addHeaders(self, request):
request.add_header("User-Agent", AGENT)
def open(self):
url = self.url
try:
request = urllib2.Request(url)
handle = urllib2.build_opener()
except IOError:
return None
return (request, handle)
def fetch(self):
request, handle = self.open()
self._addHeaders(request)
if handle:
try:
content = unicode(handle.open(request).read(), "utf-8",
errors="replace")
soup = BeautifulSoup(content)
tags = soup('a')
except urllib2.HTTPError, error:
if error.code == 404:
print >> sys.stderr, "ERROR: %s -> %s" % (error, error.url)
else:
print >> sys.stderr, "ERROR: %s" % error
tags = []
except urllib2.URLError, error:
print >> sys.stderr, "ERROR: %s" % error
tags = []
for tag in tags:
href = tag.get("href")
if href is not None:
url = urlparse.urljoin(self.url, escape(href))
if url not in self:
self.urls.append(url)
def getLinks(url):
page = Fetcher(url)
page.fetch()
for i, url in enumerate(page):
print "%d. %s" % (i, url)
Module-level functions:
def main():
depth =2
file_in = []
reload(sys)
sys.setdefaultencoding('utf-8')
filename = "stuff.txt"
text = open(filename)
for line in text:
file_in.append(line.rstrip())
for i in file_in:
print "Crawling %s (Max Depth: %d)" % (i, depth)
crawler = Crawler(i, depth)
crawler.crawl()
print "\n".join(crawler.urls)
A lot of advertising is delivered via asynchronous JavaScript executed on the page. If you're just scraping the server's initial output, you won't be able to obtain those other links. One approach is to use a headless browser such as PhantomJS to render the HTML to a file and then run your script on that. There are other possibilities as well.
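A rough sketch of that approach with Selenium driving PhantomJS (both are assumptions about your setup; any headless browser works the same way). After the page is rendered, you collect links from the final DOM, including iframe sources where ads are usually injected:

```python
# Headless-browser sketch (Python 2, matching the crawler above).
# Assumes selenium and a PhantomJS binary are installed.
from urlparse import urljoin
from selenium import webdriver

def fetch_rendered_links(url):
    driver = webdriver.PhantomJS()
    try:
        driver.get(url)  # JavaScript (including ad scripts) runs here
        links = set()
        for a in driver.find_elements_by_tag_name('a'):
            href = a.get_attribute('href')
            if href:
                links.add(href)
        # Ad creatives are frequently loaded inside iframes
        for frame in driver.find_elements_by_tag_name('iframe'):
            src = frame.get_attribute('src')
            if src:
                links.add(urljoin(url, src))
        return links
    finally:
        driver.quit()
```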
So the curl command I'm using is as follows:
cmd = "curl --write-out %{http_code} -X PUT -T " + self.basedir + putfile + " -# -o /dev/null " + self.uri + "/" + self.dist + "/" + putfile
I'd like to change this from invoking a system command to using pycurl. That way I can have more granular control over it and ultimately implement a progress bar. However, when I try to convert it to Python, the resulting script fails. Here is my attempt at a Python script:
f = open(filepath, "rb")
fs = os.path.getsize(filepath)
c = pycurl.Curl()
c.setopt(c.URL, target_url)
c.setopt(c.HTTPHEADER, ["User-Agent: Load Tool (PyCURL Load Tool)"])
c.setopt(c.PUT, 1)
c.setopt(c.READDATA, f)
c.setopt(c.INFILESIZE, int(fs))
c.setopt(c.NOSIGNAL, 1)
c.setopt(c.VERBOSE, 1)
c.body = StringIO()
c.setopt(c.WRITEFUNCTION, c.body.write)
try:
c.perform()
except:
import traceback
traceback.print_exc(file=sys.stderr)
sys.stderr.flush()
f.close()
c.close()
sys.stdout.write(".")
sys.stdout.flush()
Here's what that outputs:
* About to connect() to ************ port 8090 (#0)
* Trying 16.94.124.53... * connected
> PUT /incoming/ HTTP/1.1
Host: ***********
Accept: */*
User-Agent: Load Tool (PyCURL Load Tool)
Content-Length: 21
Expect: 100-continue
< HTTP/1.1 100 Continue
* We are completely uploaded and fine
< HTTP/1.1 500 Internal Server Error
< Content-type: text/html
* no chunk, no close, no size. Assume close to signal end
<
Thanks in advance for your help!
I've written a working upload module; you can find your answers by looking at the code.
You can also find almost all answers regarding pycurl by digging through the libcurl examples and docs.
'''
Created on Oct 22, 2013
#author: me
'''
import pycurl
import os
import wx
import sys
import hashlib
from cStringIO import StringIO
def get_file_hash(full_filename):
BLOCKSIZE = 65536
hasher = hashlib.md5()
with open(full_filename, 'rb') as afile:
buf = afile.read(BLOCKSIZE)
while len(buf) > 0:
hasher.update(buf)
buf = afile.read(BLOCKSIZE)
return hasher.hexdigest()
class FtpUpload(object):
def __init__(self, server, username, password, **items):
self.server = server
self.username = username
self.password = password
self.gauge = items.get("gauge")
self.sb_speed = items.get("sb_speed")
self.upload_file_size = items.get("upload_file_size")
self.upload_file_speed = items.get("upload_file_speed")
self.filesize = 0
self.ftp_filehash = '0'
def sizeToNiceString(self, byteCount):
for (cutoff, label) in [(1024*1024*1024, "GB"), (1024*1024, "MB"), (1024, "KB")]:
if byteCount >= cutoff:
return "%.2f %s" % (byteCount * 1.0 / cutoff, label)
if byteCount == 1:
return "1 byte"
else:
return "%d bytes" % byteCount
def initRange(self, filesize):
self.filesize = filesize
self.gauge.SetRange(filesize)
def updateValue(self, upload_d):
upload_d_int = int(upload_d)
self.gauge.SetValue(upload_d_int)
upload_d_str = self.sizeToNiceString(upload_d)
upload_percent = int((upload_d*100)/self.filesize)
upload_d_status = "{0}/{1} ({2}%)".format(upload_d_str, self.sizeToNiceString(self.filesize), upload_percent)
self.sb_speed.SetStatusText(upload_d_status, 1)
self.upload_file_size.SetLabel(upload_d_status)
self.upload_file_speed.SetLabel(upload_d_str)
def progress(self, download_t, download_d, upload_t, upload_d):
self.updateValue(upload_d)
def test(self, debug_type, debug_msg):
if len(debug_msg) < 300:
print "debug(%d): %s" % (debug_type, debug_msg.strip())
def ftp_file_hash(self, buf):
sys.stderr.write("{0:.<20} : {1}\n".format('FTP RAW ', buf.strip()))
ftp_filehash = dict()
item = buf.strip().split('\n')[0]
ext = item.split('.')[1]
if len(ext) > 3:
ftp_filename = item[:-33]
ftp_filehash = item[-32:]
self.ftp_filehash = ftp_filehash
def get_ftp_file_hash(self, filename):
c = pycurl.Curl()
list_file_hash = 'LIST -1 ' + filename + "_*"
sys.stderr.write("{0:.<20} : {1} \n".format('FTP command ', list_file_hash))
c.setopt(pycurl.URL, self.server)
c.setopt(pycurl.USERNAME, self.username)
c.setopt(pycurl.PASSWORD, self.password)
c.setopt(pycurl.VERBOSE, False)
c.setopt(pycurl.DEBUGFUNCTION, self.test)
c.setopt(pycurl.CUSTOMREQUEST, list_file_hash)
c.setopt(pycurl.WRITEFUNCTION, self.ftp_file_hash)
c.perform()
c.close()
def delete_ftp_hash_file(self, ftp_hash_file_old):
c = pycurl.Curl()
delete_hash_file = 'DELE ' + ftp_hash_file_old
sys.stderr.write("{0:.<20} : {1} \n".format('FTP command ', delete_hash_file))
c.setopt(pycurl.URL, self.server)
c.setopt(pycurl.USERNAME, self.username)
c.setopt(pycurl.PASSWORD, self.password)
c.setopt(pycurl.VERBOSE, False)
c.setopt(pycurl.DEBUGFUNCTION, self.test)
c.setopt(pycurl.CUSTOMREQUEST, delete_hash_file)
try:
c.perform()
except Exception as e:
print e
c.close()
def upload(self, full_filename, filesize):
self.initRange(filesize)
filename = os.path.basename(full_filename)
sys.stderr.write("filename: %s\n" % full_filename)
c = pycurl.Curl()
c.setopt(pycurl.USERNAME, self.username)
c.setopt(pycurl.PASSWORD, self.password)
c.setopt(pycurl.VERBOSE, False)
c.setopt(pycurl.DEBUGFUNCTION, self.test)
c.setopt(pycurl.NOBODY, True)
c.setopt(pycurl.HEADER, False)
ftp_file_path = os.path.join(self.server, os.path.basename(full_filename))
file_hash = get_file_hash(full_filename)
ftp_hash_file = ftp_file_path + "_%s" % file_hash
# Get the file size if the file already exists on the server.
try:
c.setopt(pycurl.URL, ftp_file_path)
c.perform()
filesize_offset = int(c.getinfo(pycurl.CONTENT_LENGTH_DOWNLOAD))
except Exception as error_msg:
print error_msg
wx.MessageBox(str(error_msg), 'Connection error!',
wx.OK | wx.ICON_ERROR)
# Exit upload function.
return True
ftp_file_append = True
# Get ftp file hash.
self.get_ftp_file_hash(filename)
offset = filesize_offset == -1 and '0' or filesize_offset
sys.stderr.write("L_file hash : {0:.<60}: {1:<40}\n".format(filename, file_hash))
sys.stderr.write("F_file hash : {0:.<60}: {1:<40}\n".format(filename, self.ftp_filehash))
sys.stderr.write("{0:15} : {1:.>15}\n".format('filesize:', filesize))
sys.stderr.write("{0:15} : {1:.>15}\n".format('ftp_filesize', offset))
sys.stderr.write("{0:15} : {1:.>15}\n".format('to upload:', filesize - int(offset)))
# File does not exist on the FTP server.
if filesize_offset == -1:
# File does not exist: upload from offset zero.
ftp_file_append = False
filesize_offset = 0
# Local and FTP file sizes and MD5 hashes are the same.
elif filesize_offset == self.filesize and file_hash == self.ftp_filehash:
sys.stderr.write("--- File exist on server! ---\n\n")
self.upload_file_speed.SetLabel("File exist on server!")
self.sb_speed.SetStatusText("File exist on server!", 1)
# Check next filename.
return False
# FTP file and local file contain different data.
elif file_hash != self.ftp_filehash:
ftp_file_append = False
filesize_offset = 0
ftp_hash_file_old = filename + "_" + self.ftp_filehash
# delete old hash file.
self.delete_ftp_hash_file(ftp_hash_file_old)
c.setopt(pycurl.FTPAPPEND, ftp_file_append)
c.setopt(pycurl.UPLOAD, True)
c.setopt(pycurl.PROGRESSFUNCTION, self.progress)
with open('filehash.txt', 'w') as f:
f.write(file_hash)
for item in ("filehash.txt", full_filename):
# don't show progress by default.
noprogress = True
# upload ftp_hash_file first.
ftp_url = ftp_hash_file
with open(item, "rb") as f:
# change ftp_url and show progress values, add filesize_offset.
if item != "filehash.txt":
f.seek(filesize_offset)
noprogress = False
ftp_url = ftp_file_path
c.setopt(pycurl.URL, ftp_url)
c.setopt(pycurl.NOPROGRESS, noprogress)
c.setopt(pycurl.READFUNCTION, f.read)
try:
c.perform()
if item != "filehash.txt":
sys.stderr.write("{0:15} : {1:.>15}\n\n".format("size uploaded", int(c.getinfo(pycurl.SIZE_UPLOAD))))
except Exception as error_msg:
print error_msg
wx.MessageBox(str(error_msg), 'Connection error!',
wx.OK | wx.ICON_ERROR)
# Exit upload function.
return True
self.ftp_filehash = '0'
c.close()
if __name__ == '__main__':
pass
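Coming back to the original HTTP PUT: the verbose output shows the request going to /incoming/ with no filename, which may be what the server is rejecting with a 500 (the curl command appends putfile to the URL). Either way, a more direct pycurl sketch of that curl command with a progress callback might look like this (the URL is a placeholder and this has not been run against your server):

```python
# Sketch of "curl --write-out %{http_code} -X PUT -T file URL" with progress reporting.
import os
import sys
import pycurl
from cStringIO import StringIO

def progress(download_t, download_d, upload_t, upload_d):
    if upload_t:
        sys.stdout.write('\rUploaded %d/%d bytes' % (upload_d, upload_t))
        sys.stdout.flush()

def http_put(filepath, target_url):
    body = StringIO()
    c = pycurl.Curl()
    c.setopt(pycurl.URL, target_url)            # e.g. uri + '/' + dist + '/' + filename
    c.setopt(pycurl.UPLOAD, 1)                  # PUT via upload, like curl -T
    c.setopt(pycurl.HTTPHEADER, ['User-Agent: Load Tool (PyCURL Load Tool)'])
    c.setopt(pycurl.INFILESIZE, os.path.getsize(filepath))
    c.setopt(pycurl.NOPROGRESS, 0)
    c.setopt(pycurl.PROGRESSFUNCTION, progress)
    c.setopt(pycurl.WRITEFUNCTION, body.write)
    with open(filepath, 'rb') as f:
        c.setopt(pycurl.READFUNCTION, f.read)
        c.perform()
    status = c.getinfo(pycurl.RESPONSE_CODE)
    c.close()
    return status, body.getvalue()
```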