Python cloud print authorization

I'm trying to use Google's app API to authorize my Python program to query Google Cloud Print queues. I'm using all of the information from https://developers.google.com/cloud-print/docs/pythonCode. After including my login, password, and the client-id I made from the cloud app in Google, I still get 404 errors. The Gaia method returns nothing in the token dictionary. Has anyone had experience with this, or with using their new OAuth2 system? I can't seem to find anything on Google about this problem.
Here is my program with my login details redacted.
import base64
import httplib
import sys
import os
import time
import logging
import mimetools
import mimetypes
import urllib
import urllib2
import optparse
import string
import ConfigParser
import json
CRLF = '\r\n'
BOUNDARY = mimetools.choose_boundary()
# The following are used for authentication functions.
FOLLOWUP_HOST = 'www.google.com/cloudprint'
FOLLOWUP_URI = 'select%2Fgaiaauth'
GAIA_HOST = 'www.google.com'
LOGIN_URI = '/accounts/ServiceLoginAuth'
LOGIN_URL = 'https://www.google.com/accounts/ClientLogin'
SERVICE = 'cloudprint'
OAUTH = '175351968146.apps.googleusercontent.com'
# The following are used for general backend access.
CLOUDPRINT_URL = 'http://www.google.com/cloudprint'
# CLIENT_NAME should be some string identifier for the client you are writing.
CLIENT_NAME = 'google-cloud-print'
# The following object is used in the sample code, but is not necessary.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def EncodeMultiPart(fields, files, file_type='application/xml'):
    """Encodes list of parameters and files for HTTP multipart format.

    Args:
        fields: list of tuples containing name and value of parameters.
        files: list of tuples containing param name, filename, and file contents.
        file_type: string if file type different than application/xml.
    Returns:
        A string to be sent as data for the HTTP post request.
    """
    lines = []
    for (key, value) in fields:
        lines.append('--' + BOUNDARY)
        lines.append('Content-Disposition: form-data; name="%s"' % key)
        lines.append('')  # blank line
        lines.append(value)
    for (key, filename, value) in files:
        lines.append('--' + BOUNDARY)
        lines.append('Content-Disposition: form-data; name="%s"; filename="%s"'
                     % (key, filename))
        lines.append('Content-Type: %s' % file_type)
        lines.append('')  # blank line
        lines.append(value)
    lines.append('--' + BOUNDARY + '--')
    lines.append('')  # blank line
    return CRLF.join(lines)
def GetUrl(url, tokens, data=None, cookies=False, anonymous=False):
    """Get URL, with GET or POST depending on data, adds Authorization header.

    Args:
        url: Url to access.
        tokens: dictionary of authentication tokens for specific user.
        data: If a POST request, data to be sent with the request.
        cookies: boolean, True = send authentication tokens in cookie headers.
        anonymous: boolean, True = do not send login credentials.
    Returns:
        String: response to the HTTP request.
    """
    request = urllib2.Request(url)
    if not anonymous:
        if cookies:
            logger.debug('Adding authentication credentials to cookie header')
            request.add_header('Cookie', 'SID=%s; HSID=%s; SSID=%s' % (
                tokens['SID'], tokens['HSID'], tokens['SSID']))
        else:  # Don't add Auth headers when using Cookie header with auth tokens.
            request.add_header('Authorization',
                               'GoogleLogin auth=%s' % tokens['Auth'])
    request.add_header('X-CloudPrint-Proxy', 'api-prober')
    if data:
        request.add_data(data)
        request.add_header('Content-Length', str(len(data)))
        request.add_header('Content-Type',
                           'multipart/form-data;boundary=%s' % BOUNDARY)
    # In case the gateway is not responding, we'll retry.
    retry_count = 0
    while retry_count < 5:
        try:
            result = urllib2.urlopen(request).read()
            return result
        except urllib2.HTTPError, e:
            # We see this error if the site goes down. We need to pause and retry.
            err_msg = 'Error accessing %s\n%s' % (url, e)
            logger.error(err_msg)
            logger.info('Pausing %d seconds', 60)
            time.sleep(60)
            retry_count += 1
            if retry_count == 5:
                return err_msg
def GetCookie(cookie_key, cookie_string):
    """Extract the cookie value from a set-cookie string.

    Args:
        cookie_key: string, cookie identifier.
        cookie_string: string, from a set-cookie command.
    Returns:
        string, value of cookie.
    """
    logger.debug('Getting cookie from %s', cookie_string)
    id_string = cookie_key + '='
    cookie_crumbs = cookie_string.split(';')
    for c in cookie_crumbs:
        if id_string in c:
            cookie = c.split(id_string)
            return cookie[1]
    return None
def ConvertJson(json_str):
    """Convert json string to a python object.

    Args:
        json_str: string, json response.
    Returns:
        dictionary of deserialized json string.
    """
    j = {}
    try:
        j = json.loads(json_str)
        j['json'] = True
    except ValueError, e:
        # This means the format from json_str is probably bad.
        logger.error('Error parsing json string %s\n%s', json_str, e)
        j['json'] = False
        j['error'] = e
    return j
def GetKeyValue(line, sep=':'):
    """Return value from a key value pair string.

    Args:
        line: string containing key value pair.
        sep: separator of key and value.
    Returns:
        string: value from key value string.
    """
    s = line.split(sep)
    return StripPunc(s[1])
def StripPunc(s):
    """Strip punctuation from string, except for - sign.

    Args:
        s: string.
    Returns:
        string with punctuation removed.
    """
    for c in string.punctuation:
        if c == '-':  # Could be negative number, so don't remove '-'.
            continue
        else:
            s = s.replace(c, '')
    return s.strip()
def Validate(response):
    """Determine if JSON response indicated success."""
    if response.find('"success": true') > 0:
        return True
    else:
        return False
def GetMessage(response):
    """Extract the API message from a Cloud Print API json response.

    Args:
        response: json response from API request.
    Returns:
        string: message content in json response.
    """
    lines = response.split('\n')
    for line in lines:
        if '"message":' in line:
            msg = line.split(':')
            return msg[1]
    return None
def ReadFile(pathname):
    """Read contents of a file and return content.

    Args:
        pathname: string, (path)name of file.
    Returns:
        string: contents of file.
    """
    try:
        f = open(pathname, 'rb')
        try:
            s = f.read()
        except IOError, e:
            logger.error('Error reading %s\n%s', pathname, e)
        finally:
            f.close()
        return s
    except IOError, e:
        logger.error('Error opening %s\n%s', pathname, e)
        return None
def WriteFile(file_name, data):
    """Write contents of data to a file_name.

    Args:
        file_name: string, (path)name of file.
        data: string, contents to write to file.
    Returns:
        boolean: True = success, False = errors.
    """
    status = True
    try:
        f = open(file_name, 'wb')
        try:
            f.write(data)
        except IOError, e:
            logger.error('Error writing %s\n%s', file_name, e)
            status = False
        finally:
            f.close()
    except IOError, e:
        logger.error('Error opening %s\n%s', file_name, e)
        status = False
    return status
def Base64Encode(pathname):
    """Convert a file to a base64 encoded file.

    Args:
        pathname: path name of file to base64 encode.
    Returns:
        string, name of base64 encoded file.

    For more info on data urls, see:
    http://en.wikipedia.org/wiki/Data_URI_scheme
    """
    b64_pathname = pathname + '.b64'
    file_type = mimetypes.guess_type(pathname)[0] or 'application/octet-stream'
    data = ReadFile(pathname)
    # Convert binary data to base64 encoded data.
    header = 'data:%s;base64,' % file_type
    b64data = header + base64.b64encode(data)
    if WriteFile(b64_pathname, b64data):
        return b64_pathname
    else:
        return None
def GaiaLogin(email, password):
    """Login to gaia using HTTP post to the gaia login page.

    Args:
        email: string,
        password: string
    Returns:
        dictionary of authentication tokens.
    """
    tokens = {}
    cookie_keys = ['SID', 'LSID', 'HSID', 'SSID']
    email = email.replace('+', '%2B')
    # Needs to be some random string.
    galx_cookie = base64.b64encode('%s%s' % (email, time.time()))
    # Simulate submitting a gaia login form.
    form = ('ltmpl=login&fpui=1&rm=hide&hl=en-US&alwf=true'
            '&continue=https%%3A%%2F%%2F%s%%2F%s'
            '&followup=https%%3A%%2F%%2F%s%%2F%s'
            '&service=%s&Email=%s&Passwd=%s&GALX=%s' % (FOLLOWUP_HOST,
            FOLLOWUP_URI, FOLLOWUP_HOST, FOLLOWUP_URI, SERVICE, email,
            password, galx_cookie))
    login = httplib.HTTPS(GAIA_HOST, 443)
    login.putrequest('POST', LOGIN_URI)
    login.putheader('Host', GAIA_HOST)
    login.putheader('content-type', 'application/x-www-form-urlencoded')
    login.putheader('content-length', str(len(form)))
    login.putheader('Cookie', 'GALX=%s' % galx_cookie)
    logger.info('Sent POST content: %s', form)
    login.endheaders()
    logger.info('HTTP POST to https://%s%s', GAIA_HOST, LOGIN_URI)
    login.send(form)
    (errcode, errmsg, headers) = login.getreply()
    login_output = login.getfile()
    logger.info(headers)
    login_output.close()
    login.close()
    logger.info('Login complete.')
    if errcode != 302:
        logger.error('Gaia HTTP post returned %d, expected 302', errcode)
        logger.error('Message: %s', errmsg)
    for line in str(headers).split('\r\n'):
        if not line:
            continue
        (name, content) = line.split(':', 1)
        if name.lower() == 'set-cookie':
            for k in cookie_keys:
                if content.strip().startswith(k):
                    tokens[k] = GetCookie(k, content)
    if not tokens:
        logger.error('No cookies received, check post parameters.')
        return None
    else:
        logger.debug('Received the following authorization tokens.')
        for t in tokens:
            logger.debug(t)
        return tokens
def GetAuthTokens(email, password):
    """Assign login credentials from GAIA accounts service.

    Args:
        email: Email address of the Google account to use.
        password: Cleartext password of the email account.
    Returns:
        dictionary containing Auth token.
    """
    # First get GAIA login credentials using our GaiaLogin method.
    logger.debug("GetAuthTokens")
    tokens = GaiaLogin(email, password)
    print tokens
    if tokens:
        # We still need to get the Auth token.
        params = {'accountType': 'GOOGLE',
                  'Email': email,
                  'Passwd': password,
                  'service': SERVICE,
                  'source': CLIENT_NAME}
        stream = urllib.urlopen(LOGIN_URL, urllib.urlencode(params))
        for line in stream:
            if line.strip().startswith('Auth='):
                tokens['Auth'] = line.strip().replace('Auth=', '')
    return tokens

# All of the calls to GetUrl assume you've run something like this:
tokens = GetAuthTokens('email', 'password')
All of this code is straight from the google-cloud-print developer site.
Here is the last bit of the output.
INFO:__main__:Login complete.
ERROR:__main__:Gaia HTTP post returned 404, expected 302
ERROR:__main__:Message: Not Found
ERROR:__main__:No cookies received, check post parameters.
None
Thanks in advance!

Replace this code:
GAIA_HOST = 'www.google.com'
LOGIN_URI = '/accounts/ServiceLoginAuth'
with this:
GAIA_HOST = 'accounts.google.com'
LOGIN_URI = '/ServiceLoginAuth'

I had the same problem.
If you wish, you can use a simple library and command-line program I just published.
https://github.com/escube/GoogleCloudSpooler

Related

How to get emails received from Gmail in Python?

I want to get the last 10 received Gmail messages with Python.
Currently I have this code, but it only returns a limited number of emails, and it manipulates POP3 directly, which makes it unnecessarily long.
Source of the code: https://www.code-learner.com/python-use-pop3-to-read-email-example/
import poplib
import smtplib, ssl
# imports used further down, gathered at the top:
from email.parser import Parser
from email.header import decode_header
from email.utils import parseaddr

def guess_charset(msg):
    # get charset from message object.
    charset = msg.get_charset()
    # if can not get charset
    if charset is None:
        # get message header content-type value and retrieve the charset from the value.
        content_type = msg.get('Content-Type', '').lower()
        pos = content_type.find('charset=')
        if pos >= 0:
            charset = content_type[pos + 8:].strip()
    return charset

def decode_str(s):
    value, charset = decode_header(s)[0]
    if charset:
        value = value.decode(charset)
    return value

# variable indent_number is used to decide the indent of each level in a multipart message body.
def print_info(msg, indent_number=0):
    if indent_number == 0:
        # loop to retrieve from, to, subject from email header.
        for header in ['From', 'To', 'Subject']:
            # get header value
            value = msg.get(header, '')
            if value:
                # for subject header.
                if header == 'Subject':
                    # decode the subject value
                    value = decode_str(value)
                # for from and to header.
                else:
                    # parse email address
                    hdr, addr = parseaddr(value)
                    # decode the name value.
                    name = decode_str(hdr)
                    value = u'%s <%s>' % (name, addr)
            print('%s%s: %s' % (' ' * indent_number, header, value))
    # if message has multiple parts.
    if msg.is_multipart():
        # get multiple parts from message body.
        parts = msg.get_payload()
        # loop for each part
        for n, part in enumerate(parts):
            print('%spart %s' % (' ' * indent_number, n))
            print('%s--------------------' % (' ' * indent_number))
            # print each part's information by invoking print_info recursively.
            print_info(part, indent_number + 1)
    # if not multipart.
    else:
        # get message content mime type
        content_type = msg.get_content_type()
        # if plain text or html content type.
        if content_type == 'text/plain' or content_type == 'text/html':
            # get email content
            content = msg.get_payload(decode=True)
            # get content string charset
            charset = guess_charset(msg)
            # decode the content with charset if provided.
            if charset:
                content = content.decode(charset)
            print('%sText: %s' % (' ' * indent_number, content + '...'))
        else:
            print('%sAttachment: %s' % (' ' * indent_number, content_type))

# input email address, password and pop3 server domain or ip address
email = 'yourgmail@gmail.com'
password = 'yourpassword'
# connect to pop3 server:
server = poplib.POP3_SSL('pop.gmail.com')
# open debug switch to print debug information between client and pop3 server.
server.set_debuglevel(1)
# get pop3 server welcome message.
pop3_server_welcome_msg = server.getwelcome().decode('utf-8')
# print out the pop3 server welcome message.
print(server.getwelcome().decode('utf-8'))
# user account authentication
server.user(email)
server.pass_(password)
# stat() returns the email count and occupied disk size
print('Messages: %s. Size: %s' % server.stat())
# list() returns the whole email list
resp, mails, octets = server.list()
print(mails)
# retrieve the newest email index number
#index = len(mails)
index = 3
# server.retr() gets the contents of the email with the given index number.
resp, lines, octets = server.retr(index)
# lines stores each line of the original text of the message,
# so you can get the original text of the entire message by joining them.
msg_content = b'\r\n'.join(lines).decode('utf-8')
# now parse the email content into a message object.
msg = Parser().parsestr(msg_content)
print(len(msg_content))
# get email from, to, subject attribute value.
email_from = msg.get('From')
email_to = msg.get('To')
email_subject = msg.get('Subject')
print('From ' + email_from)
print('To ' + email_to)
print('Subject ' + email_subject)
for part in msg.walk():
    if part.get_content_type():
        body = part.get_payload(decode=True)
print_info(msg, len(msg))
# delete the email from pop3 server directly by email index.
# server.dele(index)
# close pop3 server connection.
server.quit()
I also tried this code but it didn't work:
import imaplib, email, base64

def fetch_messages(username, password):
    messages = []
    conn = imaplib.IMAP4_SSL("imap.gmail.com", 993)
    conn.login(username, password)
    conn.select()
    typ, data = conn.uid('search', None, 'ALL')
    for num in data[0].split():
        typ, msg_data = conn.uid('fetch', num, '(RFC822)')
        for response_part in msg_data:
            if isinstance(response_part, tuple):
                messages.append(email.message_from_string(response_part[1]))
                typ, response = conn.store(num, '+FLAGS', r'(\Seen)')
    return messages
and this also didn't work for me...
import poplib
from email import parser

pop_conn = poplib.POP3_SSL('pop.gmail.com')
pop_conn.user('@gmail.com')
pop_conn.pass_('password')
messages = [pop_conn.retr(i) for i in range(1, len(pop_conn.list()[1]) + 1)]
# Concat message pieces:
messages = ["\n".join(mssg[1]) for mssg in messages]
# Parse message into an email object:
messages = [parser.Parser().parsestr(mssg) for mssg in messages]
for message in messages:
    print(message['subject'])
    print(message['body'])
I managed to solve it; the only issue is that it marks every unread email as read. Here is the code I used:
import imaplib

mail = imaplib.IMAP4_SSL('imap.gmail.com')
email = input('Email: ')
password = input('Password: ')
mail.login(email + '@gmail.com', password)
mail.list()
# Out: list of "folders" aka labels in gmail.
mail.select("inbox")  # connect to inbox.
result, data = mail.search(None, "ALL")
ids = data[0]  # data is a list.
id_list = ids.split()  # ids is a space separated string
latest_email_id = id_list[-1]  # get the latest
# fetch the email body (RFC822) for the given ID
result, data = mail.fetch(latest_email_id, "(RFC822)")
raw_email = data[0][1]  # here's the body, which is the raw text of the whole email,
# including headers and alternate payloads

import email
# raw_email is bytes, so parse it with message_from_bytes
# (message_from_string(str(raw_email)) would mangle the content).
email_message = email.message_from_bytes(raw_email)
print(email_message['To'])
print(email.utils.parseaddr(email_message['From']))  # for parsing "Yuji Tomita" <yuji@grovemade.com>
print(email_message.items())  # print all headers

# note that if you want to get text content (body) and the email contains
# multiple payloads (plaintext/html), you must parse each message separately.
# use something like the following: (taken from a stackoverflow post)
def get_first_text_block(email_message_instance):
    maintype = email_message_instance.get_content_maintype()
    if maintype == 'multipart':
        for part in email_message_instance.get_payload():
            if part.get_content_maintype() == 'text':
                return part.get_payload()
    elif maintype == 'text':
        return email_message_instance.get_payload()
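The question asks for the last 10 messages, and the snippet above only pulls the newest one. A minimal sketch of the extension (assuming Python 3, where fetch returns bytes; it reuses mail, id_list, and the email module from the code above):
# fetch the 10 most recent messages, newest first
for msg_id in reversed(id_list[-10:]):
    result, data = mail.fetch(msg_id, "(RFC822)")
    message = email.message_from_bytes(data[0][1])
    print(message['From'], '->', message['Subject'])
As for the marks-as-read caveat: fetching with "(RFC822)" sets the \Seen flag, while fetching with "(BODY.PEEK[])" returns the same message content without touching the flags.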
https://developers.google.com/gmail/api/quickstart/python is the preferred way:
from gmail.gmail import gmail_auth, ListThreadsMatchingQuery
service = gmail_auth()
threads = ListThreadsMatchingQuery(service, query=query)
where:
from googleapiclient import errors  # provides errors.HttpError, used below

def ListThreadsMatchingQuery(service, user_id='me', query=''):
    """List all Threads of the user's mailbox matching the query.

    Args:
        service: Authorized Gmail API service instance.
        user_id: User's email address. The special value "me"
            can be used to indicate the authenticated user.
        query: String used to filter messages returned.
            Eg.- 'label:UNREAD' for unread messages only.

    Returns:
        List of threads that match the criteria of the query. Note that the
        returned list contains Thread IDs; you must use get with the
        appropriate ID to get the details for a Thread.
    """
    try:
        response = service.users().threads().list(userId=user_id, q=query).execute()
        threads = []
        if 'threads' in response:
            threads.extend(response['threads'])
        while 'nextPageToken' in response:
            page_token = response['nextPageToken']
            response = service.users().threads().list(userId=user_id, q=query,
                                                      pageToken=page_token).execute()
            threads.extend(response['threads'])
        return threads
    except errors.HttpError as error:
        raise error
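Since the returned list only contains thread IDs, you would follow up with threads().get to obtain the actual messages. A small sketch using the standard Gmail API calls (service and threads as above):
# pull each thread and print the subject of its first message
for thread in threads:
    tdata = service.users().threads().get(userId='me', id=thread['id']).execute()
    headers = tdata['messages'][0]['payload']['headers']
    subject = next((h['value'] for h in headers if h['name'] == 'Subject'), '')
    print(subject)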
You could try the easyimap library to get a list of e-mails; I'm not sure if it works with POP3.
Code example:
import easyimap
host = 'imap.gmail.com'
user = 'you#example.com'
password = 'secret'
mailbox = 'INBOX.subfolder'
imapper = easyimap.connect(host, user, password, mailbox)
email_quantity = 10
emails_from_your_mailbox = imapper.listids(limit=email_quantity)
imapper.quit()
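To read the actual messages you would pass each id back to the imapper before calling quit(). A sketch assuming easyimap's mail() accessor and its title attribute (names from memory of the project's README; verify against the version you install):
for mail_id in emails_from_your_mailbox:
    mail = imapper.mail(mail_id)  # fetch one message by id (assumed API)
    print(mail.title)             # subject of the message (assumed attribute)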

Get e-mail received date via IMAP using Python

I'm using the following code (got it from StackOverflow :)) to get all unread e-mail from specific email addresses. It works perfectly!
I would, however, like to get the actual received (or sent) date for each e-mail I'm getting an attached file from, but I don't know how to do that.
import email
import imaplib
import os
import sys
import random
import string
import glob
import unicodedata

def remove_accents(s):
    nkfd_form = unicodedata.normalize('NFKD', s)
    return u''.join([c for c in nkfd_form if not unicodedata.combining(c)])

def remove_non_ascii(text):
    return unidecode(unicode(text, encoding="cp865"))

def replace_non_ascii(x):
    return ''.join(i if ord(i) < 128 else '_' for i in x)

detach_dir = r'\\1.1.1.1\xxx\xxx\drop_folder'

try:
    imapSession = imaplib.IMAP4_SSL('outlook.office365.com')
    typ, accountDetails = imapSession.login('xxxx', 'xxxx')
    if typ != 'OK':
        print('Not able to sign in!')
        raise
    imapSession.select('Inbox')
    typ, data = imapSession.search(None, '(UNSEEN FROM "@xxxx.xxx")')
    if typ != 'OK':
        print('Error searching Inbox.')
        raise
    # Iterating over all emails
    for msgId in data[0].split():
        typ, messageParts = imapSession.fetch(msgId, '(RFC822)')
        if typ != 'OK':
            print('Error fetching mail.')
            raise
        emailBody = messageParts[0][1]
        mail = email.message_from_string(emailBody)
        for part in mail.walk():
            if part.get_content_maintype() == 'multipart':
                # print part.as_string()
                continue
            if part.get('Content-Disposition') is None:
                # print part.as_string()
                continue
            fileName = part.get_filename().encode('utf-8')
            if bool(fileName):
                filePath = os.path.join(detach_dir, 'EXP-' + fileName + '.xls')
                if not os.path.isfile(filePath):
                    fp = open(filePath, 'wb')
                    fp.write(part.get_payload(decode=True))
                    fp.close()
    imapSession.close()
    imapSession.logout()
except:
    print('Not able to download all attachments.')
You could fetch the INTERNALDATE instead of, or in addition to, the RFC822 item; it is (generally) the time the server received the message.
You will have to do some parsing of the returned item, since imaplib does no parsing of FETCH results. It will be easier to parse if it's the only thing you fetch.
The response will look something like
* 5 FETCH (INTERNALDATE "17-Jul-2018 02:44:25 -0700")
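imaplib ships a helper for exactly this format. A minimal sketch on top of the question's loop (reusing imapSession and msgId from the code above):
import imaplib
import time

# ask for INTERNALDATE alone so the response is trivial to parse
typ, data = imapSession.fetch(msgId, '(INTERNALDATE)')
# data[0] looks like: '5 (INTERNALDATE "17-Jul-2018 02:44:25 -0700")'
received = imaplib.Internaldate2tuple(data[0])  # time.struct_time in local time
print(time.strftime('%Y-%m-%d %H:%M:%S', received))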

Why h.getresponse() is needed in python logging HTTPHandler?

I overrode the emit method of Python logging's HTTPHandler to adapt it to my needs, and I noticed the line
h.getresponse() #can't do anything with the result
Why is this line necessary?
I noticed that removing this line has no effect when logging over plain HTTP, but it makes the logs fail when using a secure (HTTPS) connection.
def emit(self, record):
    """
    Emit a record.

    Send the record to the Web server as a percent-encoded dictionary
    """
    try:
        import http.client, urllib.parse
        host = self.host
        if self.secure:
            h = http.client.HTTPSConnection(host, context=self.context)
        else:
            h = http.client.HTTPConnection(host)
        url = self.url
        data = urllib.parse.urlencode(self.mapLogRecord(record))
        if self.method == "GET":
            if (url.find('?') >= 0):
                sep = '&'
            else:
                sep = '?'
            url = url + "%c%s" % (sep, data)
        h.putrequest(self.method, url)
        # support multiple hosts on one IP address...
        # need to strip optional :port from host, if present
        i = host.find(":")
        if i >= 0:
            host = host[:i]
        # See issue #30904: putrequest call above already adds this header
        # on Python 3.x.
        # h.putheader("Host", host)
        if self.method == "POST":
            h.putheader("Content-type",
                        "application/x-www-form-urlencoded")
            h.putheader("Content-length", str(len(data)))
        if self.credentials:
            import base64
            s = ('%s:%s' % self.credentials).encode('utf-8')
            s = 'Basic ' + base64.b64encode(s).strip().decode('ascii')
            h.putheader('Authorization', s)
        h.endheaders()
        if self.method == "POST":
            h.send(data.encode('utf-8'))
        h.getresponse()  # can't do anything with the result
    except Exception:
        self.handleError(record)
The getresponse() call blocks until the server has answered, which guarantees the request was actually transmitted before the connection object is discarded. Without it, the connection can be torn down while the request is still in flight; in practice plain-HTTP requests tend to get flushed anyway, but over TLS the shutdown can abort a partially delivered request, which matches the behaviour you observed.
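For illustration, the same pattern outside of logging: with http.client, a request/response cycle is only complete once the response has been read (hypothetical host and path):
import http.client

conn = http.client.HTTPSConnection('logs.example.com')  # hypothetical endpoint
conn.request('POST', '/log', body='msg=hello',
             headers={'Content-Type': 'application/x-www-form-urlencoded'})
resp = conn.getresponse()  # blocks until the server answers the request
resp.read()                # drain the body so the connection could be reused
conn.close()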

Python - How to extract URLs (plain/html, quote-printable/base64/7bit) from an email file [closed]

I have searched in many places, but I haven't come across a script that extracts the URLs from emails properly, so I am presenting what I came up with. This is working perfectly for me.
It can handle plain-text and html-text content types, and supports quoted-printable, base64, and 7bit encodings.
NOTE: I wrote this as part of another task, you may have to tweak it to suit your need. Post any questions, and I can help answer.
Modules to import for this to work:
import traceback
import BeautifulSoup
import re
from sets import Set
import email
import quopri, base64
Here are the APIs I wrote that will do this job:
def decode_quote_printable_part(self, quo_pri_part):
    """
    Decodes a quote-printable encoded MIME object
    :param quo_pri_part: MIME msg part
    :return: decoded text, null if exception
    """
    try:
        quo_pri_payload = quo_pri_part.get_payload()
        return quopri.decodestring(quo_pri_payload)
    except Exception as err:
        print "ERROR - Exception when decoding quoted printable: %s" % err
        return ""
def decode_base64_part(self, base64_part):
    """
    Decodes base64 encoded MIME object
    :param base64_part: MIME msg part
    :return: decoded text, null if exception
    """
    try:
        decoded_part = base64.b64decode(base64_part)
        return decoded_part
    except Exception as err:
        print "ERROR - Exception when decoding base64 part: %s" % err
        return ""
def get_urls_from_html_part(self, html_code):
    """
    Parses the given HTML text and extracts the href links from it.
    The input should already be decoded
    :param html_code: Decoded html text
    :return: A list of href links (includes mailto: links as well), null list if exception
    """
    try:
        soup = BeautifulSoup.BeautifulSoup(html_code)
        html_urls = []
        for link in soup.findAll("a"):
            url = link.get("href")
            if url and "http" in url:
                html_urls.append(url)
        return html_urls
    except Exception as err:
        print "ERROR - Exception when parsing the html body: %s" % err
        return []
def get_urls_from_plain_part(self, email_data):
    """
    Parses the given plain text and extracts the URLs out of it
    :param email_data: plain text to parse
    :return: A list of URLs (deduplicated), a null list if exception
    """
    try:
        pattern = "abcdefghijklmnopqrstuvwxyz0123456789./\~#%&()_-+=;?:[]!$*,#'^`<{|\""
        indices = [m.start() for m in re.finditer('http://', email_data)]
        indices.extend([n.start() for n in re.finditer('https://', email_data)])
        urls = []
        if indices:
            if len(indices) > 1:
                new_lst = zip(indices, indices[1:])
                for x, y in new_lst:
                    tmp = email_data[x:y]
                    url = ""
                    for ch in tmp:
                        if ch.lower() in pattern:
                            url += ch
                        else:
                            break
                    urls.append(url)
            tmp = email_data[indices[-1]:]
            url = ""
            for ch in tmp:
                if ch.lower() in pattern:
                    url += ch
                else:
                    break
            urls.append(url)
            urls = list(Set(urls))
            return urls
        return []
    except Exception as err:
        print "ERROR - Exception when parsing plain text for urls: %s" % err
        return []
def get_urls_list(self, msg):
    """
    Collects all the URLs from an email
    :param msg: email message object
    :return: A dictionary of URLs => final_urls = {'http': [], 'https': []}
    """
    urls = []
    for part in msg.walk():
        decoded_part = part.get_payload()
        if part.get("Content-Transfer-Encoding") == "quoted-printable":
            decoded_part = self.decode_quote_printable_part(part)
        elif part.get("Content-Transfer-Encoding") == "base64":
            decoded_part = self.decode_base64_part(part.get_payload())
        if part.get_content_subtype() == "plain":
            urls.extend(self.get_urls_from_plain_part(decoded_part))
        elif part.get_content_subtype() == "html":
            urls.extend(self.get_urls_from_html_part(decoded_part))
    final_urls = {'http': [], 'https': []}
    for url in urls:
        if "http://" in url:
            final_urls['http'].append(url)
        else:
            final_urls['https'].append(url)
    return final_urls
Here is how to call this API:
try:
    with open(filename, 'r') as f:
        data = f.read()
    msg = email.message_from_string(data)
    final_urls = self.get_urls_list(msg)
except:
    pass

unable to detect request content_type

On a Django server, I process uploaded zip files sent from a Python script, but I am getting "" (a blank string) for file.content_type. What am I doing wrong?
import shutil

from django.http import HttpResponse, HttpResponseNotAllowed
from django.utils import simplejson
from django.views.decorators.csrf import csrf_exempt

@csrf_exempt
def Import(request):
    if request.method != 'POST':
        return HttpResponseNotAllowed('Only POST here')
    if not request.FILES or not request.FILES.get(u'file'):
        return HttpResponse('Must upload a file')
    file = request.FILES[u'file']
    if file.content_type == 'application/zip':
        unzipped_dir = unzip_file(file)
        uid = create_project(unzipped_dir)
        shutil.rmtree(unzipped_dir)
        py_ob = {}
        py_ob['success'] = uid is not None
        if uid is not None:
            py_ob['id'] = uid
        json_ob = simplejson.dumps(py_ob)
        return HttpResponse(json_ob, mimetype="application/json")
    else:
        return HttpResponseNotAllowed('Only POST zip files here')
This is the script which sends the zip file up:
import sys
import os
import requests

if len(sys.argv) < 5:
    print "pass in url, username, password, file"
else:
    url = sys.argv[1]
    username = sys.argv[2]
    password = sys.argv[3]
    phile = sys.argv[4]
    if os.path.exists(phile):
        files = {'file': open(phile, 'rb')}
        r = requests.post(url, files=files, auth=(username, password))
        if r.status_code == 200:
            json_response = r.json()
            if json_response['success']:
                print "id: " + str(json_response['id'])
            else:
                print "failure in processing bundle"
        else:
            print "server problem: " + str(r.status_code)
            print r.text
    else:
        print "cannot find file to upload"
The Content-Type header is completely arbitrary (and optional) and not a good way to detect whether or not you're dealing with a valid ZIP file. Have you made sure your client is actually supplying it?
Django's documentation tells us the same:

UploadedFile.content_type
The content-type header uploaded with the file (e.g. text/plain or application/pdf). Like any data supplied by the user, you shouldn’t trust that the uploaded file is actually this type. You’ll still need to validate that the file contains the content that the content-type header claims – “trust but verify.”

You should be using zipfile.is_zipfile instead.
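A minimal sketch of that check inside the view from the question (unzip_file is the question's own helper; note that is_zipfile reads from the file object, so rewind it afterwards):
import zipfile

file = request.FILES[u'file']
if zipfile.is_zipfile(file):  # inspects the file's magic bytes instead of trusting the header
    file.seek(0)              # is_zipfile moved the file pointer; rewind before unzipping
    unzipped_dir = unzip_file(file)
else:
    return HttpResponseNotAllowed('Only POST zip files here')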
