I need a Python script that gets the Google AdSense earnings, and I found adsense_scraper:
http://pypi.python.org/pypi/adsense_scraper/0.5
It uses Twill and html5lib to scrape Google AdSense earnings data. When I use it I get this error message:
Traceback (most recent call last):
File "adsense_scraper.py", line 163, in <module>
data = main()
File "adsense_scraper.py", line 154, in main
b = get_adsense(login, password)
File "adsense_scraper.py", line 128, in get_adsense
b.submit()
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\browser.py", line 467, in submit
self._journey('open', request)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\browser.py", line 523, in _journey
r = func(*args, **kwargs)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
return self._mech_open(url, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
response = UserAgentBase.open(self, request, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 192, in open
response = meth(req, response)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 590, in http_response
"http", request, response, code, msg, hdrs)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 209, in error
result = apply(self._call_chain, args)
File "C:\Python26\lib\urllib2.py", line 361, in _call_chain
result = func(*args)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 135, in http_error_302
return self.parent.open(new)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
return self._mech_open(url, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
response = UserAgentBase.open(self, request, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 192, in open
response = meth(req, response)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\utils.py", line 442, in http_response
"refresh", msg, hdrs)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 209, in error
result = apply(self._call_chain, args)
File "C:\Python26\lib\urllib2.py", line 361, in _call_chain
result = func(*args)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 135, in http_error_302
return self.parent.open(new)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
return self._mech_open(url, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
response = UserAgentBase.open(self, request, data)
File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 181, in open
response = urlopen(self, req, data)
File "C:\Python26\lib\urllib2.py", line 406, in _open
'unknown_open', req)
File "C:\Python26\lib\urllib2.py", line 361, in _call_chain
result = func(*args)
File "C:\Python26\lib\urllib2.py", line 1163, in unknown_open
raise URLError('unknown url type: %s' % type)
urllib2.URLError: <urlopen error unknown url type: 'http>
So the important thing is:
urllib2.URLError: <urlopen error unknown url type: 'http>
Can somebody tell me where the error is? Is there even a better way to get the data via Python? Thanks
There are several errors with the package; you mentioned only the first one.
1) The twill package does not handle Google's redirects correctly. Adding
newurl = newurl.strip( "'" )
to twill/other_packages/_mechanize_dist/_http.py:108, just before the line
newurl = _rfc3986.clean_url(newurl, "latin-1")
fixes that.
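For context, here is a minimal sketch of how that region of _http.py reads after the edit (the clean_url line is quoted from twill 0.9's bundled mechanize; the comments are mine):

# twill/other_packages/_mechanize_dist/_http.py, in the redirect handler (around line 108)
newurl = newurl.strip( "'" )                    # added: strip the stray quotes Google wraps around the redirect URL
newurl = _rfc3986.clean_url(newurl, "latin-1")  # existing line, which then receives a clean URL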
2) You have to have the correct language set in AdSense (English).
3) There are several problems in the original adsense_scraper. Here is a corrected version:
#!/usr/bin/env python
"""Scrapes Google AdSense data with Python using Twill

Current canonical location of this module is here:
http://github.com/etrepum/adsense_scraper/tree/master

Usage::

    from adsense_scraper import get_adsense, get_time_period
    b = get_adsense('YOUR_ADSENSE_LOGIN', 'YOUR_ADSENSE_PASSWORD')
    rows = get_time_period(b, 'yesterday')
    # The summary data is always the first row with channel == ''
    print 'I earned this much yesterday: $%(earnings)s' % rows[0]

"""
# requires html5lib, twill
import sys
import pprint
import decimal
from cStringIO import StringIO
from xml.etree import cElementTree

try:
    from html5lib import HTMLParser
    import twill.commands
except ImportError:
    print >>sys.stderr, """\
adsense_scraper has dependencies::

    Twill 0.9 http://twill.idyll.org/
    html5lib 0.11 http://code.google.com/p/html5lib/

Try this::

    $ easy_install twill html5lib
"""
    raise SystemExit()

__version__ = '0.5'

SERVICE_LOGIN_BOX_URL = "https://www.google.com/accounts/ServiceLogin?service=adsense&rm=hide&fpui=3&nui=15&alwf=true&ltmpl=adsense&passive=true&continue=https%3A%2F%2Fwww.google.com%2Fadsense%2Fgaiaauth2&followup=https%3A%2F%2Fwww.google.com%2Fadsense%2Fgaiaauth2&hl=en_US"
OVERVIEW_URL = "https://www.google.com/adsense/report/overview?timePeriod="

TIME_PERIODS = [
    'today',
    'yesterday',
    'thismonth',
    'lastmonth',
    'sincelastpayment',
]


def parse_decimal(s):
    """Return an int or decimal.Decimal given a human-readable number"""
    light_stripped = s.strip(u'\u20ac')
    stripped = light_stripped.replace(',', '.').rstrip('%').lstrip('$')
    try:
        int(stripped)
        return light_stripped
    except ValueError:
        pass
    try:
        float(stripped)
        return light_stripped
    except ValueError:
        return decimal.Decimal(stripped)


def parse_summary_table(doc):
    """
    Parse the etree doc for summarytable, returns::

        [{'channel': unicode,
          'impressions': int,
          'clicks': int,
          'ctr': decimal.Decimal,
          'ecpm': decimal.Decimal,
          'earnings': decimal.Decimal}]
    """
    for t in doc.findall('.//table'):
        if t.attrib.get('id') == 'summarytable':
            break
    else:
        raise ValueError("summary table not found")

    res = []
    FIELDS = ['impressions', 'clicks', 'ctr', 'ecpm', 'earnings']
    for row in t.findall('.//tr'):
        celltext = []
        for c in row.findall('td'):
            tail = ''
            # adsense inserts an empty span if a row has a period in it, so
            # get the children and find the tail element to append to the text
            if c.find('a') and c.find('a').getchildren():
                tail = c.find('a').getchildren()[0].tail or ''
            celltext.append('%s%s' % ((c.text or c.findtext('a') or '').strip(), tail.strip()))
        celltext = filter(lambda x: x != "", celltext)
        if len(celltext) != len(FIELDS):
            continue
        try:
            value_cols = map(parse_decimal, celltext)
        except decimal.InvalidOperation:
            continue
        res.append(dict(zip(FIELDS, value_cols)))
    return res


def get_adsense(login, password):
    """Returns a twill browser instance after having logged in to AdSense
    with *login* and *password*.

    The returned browser will have all of the appropriate cookies set but may
    not be at the exact page that you want data from.
    """
    b = twill.commands.get_browser()
    b.go(SERVICE_LOGIN_BOX_URL)
    for form in b.get_all_forms():
        try:
            form['Email'] = login
            form['Passwd'] = password
        except ValueError:
            continue
        else:
            break
    else:
        raise ValueError("Could not find login form on page")
    b._browser.select_form(predicate=lambda f: f is form)
    b.submit()
    return b


def get_time_period(b, period):
    """Returns the parsed summarytable for the time period *period* given
    *b* which should be the result of a get_adsense call. *period* must be
    a time period that AdSense supports:
    ``'today'``, ``'yesterday'``, ``'thismonth'``,
    ``'lastmonth'``, ``'sincelastpayment'``.
    """
    b.go(OVERVIEW_URL + period)
    # The cElementTree treebuilder doesn't work reliably enough
    # to use directly, so we parse and then dump into cElementTree.
    doc = cElementTree.fromstring(HTMLParser().parse(b.get_html()).toxml())
    return parse_summary_table(doc)


def main():
    try:
        login, password = sys.argv[1:]
    except ValueError:
        raise SystemExit("usage: %s LOGIN PASSWORD" % (sys.argv[0],))
    twill.set_output(StringIO())
    twill.commands.reset_browser()
    b = get_adsense(login, password)
    data = {}
    for period in TIME_PERIODS:
        data[period] = get_time_period(b, period)
    pprint.pprint(data)
    twill.set_output(None)
    return data


if __name__ == '__main__':
    data = main()
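A quick usage sketch, taken from the module docstring (the credentials are placeholders):

from adsense_scraper import get_adsense, get_time_period

b = get_adsense('YOUR_ADSENSE_LOGIN', 'YOUR_ADSENSE_PASSWORD')
rows = get_time_period(b, 'yesterday')
# The summary data is always the first row with channel == ''
print 'I earned this much yesterday: $%(earnings)s' % rows[0]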
Related
I am trying to implement the Amazon web scraper mentioned here. However, I get the output mentioned below. The output repeats until it stops with RecursionError: maximum recursion depth exceeded.
I have already tried downgrading eventlet to version 0.17.4, as mentioned here.
Also, the requests module is getting patched, as you can see in helpers.py.
helpers.py
import os
import random
from datetime import datetime
from urllib.parse import urlparse

import eventlet
requests = eventlet.import_patched('requests.__init__')
time = eventlet.import_patched('time')
import redis
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

import settings

num_requests = 0

redis = redis.StrictRedis(host=settings.redis_host, port=settings.redis_port, db=settings.redis_db)


def make_request(url, return_soup=True):
    # global request building and response handling
    url = format_url(url)
    if "picassoRedirect" in url:
        return None  # skip the redirect URLs
    global num_requests
    if num_requests >= settings.max_requests:
        raise Exception("Reached the max number of requests: {}".format(settings.max_requests))
    proxies = get_proxy()
    try:
        r = requests.get(url, headers=settings.headers, proxies=proxies)
    except RequestException as e:
        log("WARNING: Request for {} failed, trying again.".format(url))
    num_requests += 1
    if r.status_code != 200:
        os.system('say "Got non-200 Response"')
        log("WARNING: Got a {} status code for URL: {}".format(r.status_code, url))
        return None
    if return_soup:
        return BeautifulSoup(r.text), r.text
    return r


def format_url(url):
    # make sure URLs aren't relative, and strip unnecessary query args
    u = urlparse(url)
    scheme = u.scheme or "https"
    host = u.netloc or "www.amazon.de"
    path = u.path
    if not u.query:
        query = ""
    else:
        query = "?"
        for piece in u.query.split("&"):
            k, v = piece.split("=")
            if k in settings.allowed_params:
                query += "{k}={v}&".format(**locals())
        query = query[:-1]
    return "{scheme}://{host}{path}{query}".format(**locals())


def log(msg):
    # global logging function
    if settings.log_stdout:
        try:
            print("{}: {}".format(datetime.now(), msg))
        except UnicodeEncodeError:
            pass  # squash logging errors in case of non-ascii text


def get_proxy():
    # choose a proxy server to use for this request, if we need one
    if not settings.proxies or len(settings.proxies) == 0:
        return None
    proxy = random.choice(settings.proxies)
    proxy_url = "socks5://{user}:{passwd}@{ip}:{port}/".format(
        user=settings.proxy_user,
        passwd=settings.proxy_pass,
        ip=proxy,
        port=settings.proxy_port,
    )
    return {
        "http": proxy_url,
        "https": proxy_url
    }


if __name__ == '__main__':
    # test proxy server IP masking
    r = make_request('https://api.ipify.org?format=json', return_soup=False)
    print(r.text)
output
Traceback (most recent call last):
File "helpers.py", line 112, in <module>
r = make_request('https://api.ipify.org?format=json', return_soup=False)
File "helpers.py", line 36, in make_request
r = requests.get(url, headers=settings.headers, proxies=proxies)
File "/home/ec2-user/env/lib64/python3.7/site-packages/requests/api.py", line 76, in get
return request('get', url, params=params, **kwargs)
File "/home/ec2-user/env/lib64/python3.7/site-packages/requests/api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "/home/ec2-user/env/lib64/python3.7/site-packages/requests/sessions.py", line 530, in request
resp = self.send(prep, **send_kwargs)
File "/home/ec2-user/env/lib64/python3.7/site-packages/requests/sessions.py", line 643, in send
r = adapter.send(request, **kwargs)
File "/home/ec2-user/env/lib64/python3.7/site-packages/requests/adapters.py", line 449, in send
timeout=timeout
File "/home/ec2-user/env/lib64/python3.7/site-packages/urllib3/connectionpool.py", line 672, in urlopen
chunked=chunked,
File "/home/ec2-user/env/lib64/python3.7/site-packages/urllib3/connectionpool.py", line 376, in _make_request
self._validate_conn(conn)
File "/home/ec2-user/env/lib64/python3.7/site-packages/urllib3/connectionpool.py", line 994, in _validate_conn
conn.connect()
File "/home/ec2-user/env/lib64/python3.7/site-packages/urllib3/connection.py", line 300, in connect
conn = self._new_conn()
File "/home/ec2-user/env/lib64/python3.7/site-packages/urllib3/contrib/socks.py", line 99, in _new_conn
**extra_kw
File "/home/ec2-user/env/lib64/python3.7/site-packages/socks.py", line 199, in create_connection
sock.connect((remote_host, remote_port))
File "/home/ec2-user/env/lib64/python3.7/site-packages/socks.py", line 47, in wrapper
return function(*args, **kwargs)
File "/home/ec2-user/env/lib64/python3.7/site-packages/socks.py", line 774, in connect
super(socksocket, self).settimeout(self._timeout)
File "/home/ec2-user/env/lib64/python3.7/site-packages/eventlet/greenio/base.py", line 395, in settimeout
self.setblocking(True)
What might be the problem here?
Turns out removing eventlet.monkey_patch() and the import eventlet line solved the problem.
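For reference, a sketch of what the top of helpers.py looks like with eventlet removed entirely (assuming nothing else in the project relies on the green-thread patching; the import_patched calls go away along with import eventlet):

import os
import random
import time
from datetime import datetime
from urllib.parse import urlparse

import redis
import requests  # plain requests, no eventlet patching
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

import settings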
I'm trying to write a small Python 3 utility script that checks whether a file exists on my server.
So I have the code below, which has a big array of string values that I pass to a simple function that returns the URL and the response code.
However, when I run it I get all these errors, and I don't even know where to start:
$ python ReturnPath.py
Traceback (most recent call last):
File "ReturnPath.py", line 86, in <module>
checkResponse(u)
File "ReturnPath.py", line 5, in checkResponse
code = urllib.request.urlopen(url).getcode()
File "C:\Program Files\Python37\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\Program Files\Python37\lib\urllib\request.py", line 510, in open
req = Request(fullurl, data)
File "C:\Program Files\Python37\lib\urllib\request.py", line 328, in __init__
self.full_url = url
File "C:\Program Files\Python37\lib\urllib\request.py", line 354, in full_url
self._parse()
File "C:\Program Files\Python37\lib\urllib\request.py", line 383, in _parse
raise ValueError("unknown url type: %r" % self.full_url)
ValueError: unknown url type: '"https://myserver.org/Media/CharacterAvatarImages/ae275ecb-183e-4e8d-8465-9d6d36c1323f.jpg"'
Here is my code:
import urllib.request

def checkResponse(url):
    code = urllib.request.urlopen(url).getcode()
    print(url + " = " + code)
    return

arrCases = []
arrCases.extend([
    "https://myserver.org/Media/CharacterAvatarImages/ae275ecb-183e-4e8d-8465-9d6d36c1323f.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/3ea92fa3-1ef0-4358-b38d-bb04e653aa53.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/7958a0e3-171b-46b5-875e-970368389bdf.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/e9a6cb00-6811-4b47-9aac-88480578dd44.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/73df88c3-b829-4519-9523-2bbe1f2c8549.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/61aa614b-5c95-487c-b4e3-783231b43677.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/8be7811f-18dc-4a81-a557-8b81605e3452.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/56539acb-2b1b-4410-a4bc-ac2eb0dc00fa.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/8bcf93fc-b435-4fd4-9c82-4aba78c58529.jpg",
])

for u in arrCases:
    checkResponse(u)
What am I doing wrong?
You have to catch errors from broken URLs. I also increased speed through multiprocessing.Pool.
import urllib.request
from urllib.error import HTTPError, URLError
import multiprocessing

def checkResponse(url):
    try:
        code = urllib.request.urlopen(url, timeout=1).getcode()
    except (HTTPError, URLError) as error:
        print(url, " = ", error)
    else:
        print(url, " = ", code)
    return

arrCases = []
arrCases.extend([
    "https://i.stack.imgur.com/DsNOB.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/ae275ecb-183e-4e8d-8465-9d6d36c1323f.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/3ea92fa3-1ef0-4358-b38d-bb04e653aa53.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/7958a0e3-171b-46b5-875e-970368389bdf.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/e9a6cb00-6811-4b47-9aac-88480578dd44.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/73df88c3-b829-4519-9523-2bbe1f2c8549.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/61aa614b-5c95-487c-b4e3-783231b43677.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/8be7811f-18dc-4a81-a557-8b81605e3452.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/56539acb-2b1b-4410-a4bc-ac2eb0dc00fa.jpg",
    "https://myserver.org/Media/CharacterAvatarImages/8bcf93fc-b435-4fd4-9c82-4aba78c58529.jpg",
])

with multiprocessing.Pool(processes=4) as pool:
    pool.map(checkResponse, arrCases)
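One more note on the original ValueError: the traceback shows literal quote characters inside the URL (unknown url type: '"https://…"'), which suggests the strings being passed in contain embedded quotes. If that is the case, a minimal guard (my addition, not part of the answer above) is to strip them before opening:

def clean_url(url):
    # remove stray quote characters that make urlopen reject the scheme
    return url.strip('"').strip("'")

code = urllib.request.urlopen(clean_url(url), timeout=1).getcode()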
I am just a newly self-taught programmer. I was hoping to upload some numbers to my website with Python, but somehow I failed. Could you help me figure out what is wrong?
Here is my original Python code.
#!/usr/bin/python
import time
import urllib.request
import random
import datetime
from urllib.request import Request, urlopen

basic_web = 'http://ihome.ust.hk/~xxxxx/cgi-bin/datafile.php?'
message = ""

while True:
    local_time = time.time()
    web_x = basic_web
    file1 = open("datalist1.txt", "r")
    queue1 = file1.read()
    file1.close()
    web_x += "&queue1=" + queue1
    file2 = open("datalist2.txt", "r")
    queue2 = file2.read()
    file2.close()
    web_x += "&queue2=" + queue2
    web_x += "&local_time=" + str(local_time)
    print(web_x)
    #req = Request(web_x)
    #html = urlopen(req).read()
    response = urllib.request.urlopen(web_x, timeout=1)
    html = response.read()
    print(html)
    time.sleep(0.1)
    print("hehe")
And here is the output error that I got:
Traceback (most recent call last):
File "C:\web bus stop\local\datauploader.py", line 25, in <module>
response = urllib.request.urlopen(web_x)
File "C:\Users\ad\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 162, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\ad\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 465, in open
response = self._open(req, data)
File "C:\Users\ad\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 483, in _open
'_open', req)
File "C:\Users\ad\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 443, in _call_chain
result = func(*args)
File "C:\Users\ad\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1268, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "C:\Users\ad\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1243, in do_open
r = h.getresponse()
File "C:\Users\ad\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 1174, in getresponse
response.begin()
File "C:\Users\ad\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 282, in begin
version, status, reason = self._read_status()
File "C:\Users\ad\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 264, in _read_status
raise BadStatusLine(line)
http.client.BadStatusLine: connected! queue1queue2 finish sir!
I would really appreciate it if you guys could help me figure out what the bug is.
Never mind.
I ran it on a different computer and it works now.
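For the record, http.client.BadStatusLine means the first line of the server's reply was not a valid HTTP status line; here those first bytes were the script's own text ("connected! queue1queue2 finish sir!"), so the CGI endpoint was answering without proper HTTP headers. A defensive client-side sketch (the real fix belongs on the server):

from http.client import BadStatusLine
import urllib.request

try:
    response = urllib.request.urlopen(web_x, timeout=1)
    html = response.read()
except BadStatusLine as e:
    # the server spoke before sending a valid HTTP status line
    print("malformed HTTP response:", e)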
I'm writing code for a Django-based static blog, but I am running into a similar issue in 3 or 4 different areas of my code. I figured if I can get one fixed then I can get the others fixed as well. The code in focus is a Django management command that I call update_blog1. Here's the traceback...
Traceback (most recent call last):
File "C:\Python34\lib\site-packages\django\core\handlers\base.py", line 132, in get_response
response = wrapped_callback(request, *callback_args, **callback_kwargs)
File "C:/Users/Jaysp_000/firstSite/PROJECTone\blog_static\views.py", line 179, in archive
{'posts' : posts}
File "C:/Users/Jaysp_000/firstSite/PROJECTone\blog_static\views.py", line 14, in render_response
return render_to_response(*args, **kwargs)
File "C:\Python34\lib\site-packages\django\shortcuts.py", line 45, in render_to_response
using=using)
File "C:\Python34\lib\site-packages\django\template\loader.py", line 116, in render_to_string
template_name, context, context_instance, dirs, dictionary)
File "C:\Python34\lib\site-packages\django\template\engine.py", line 221, in render_to_string
return t.render(context_instance)
File "C:\Python34\lib\site-packages\django\template\base.py", line 208, in render
with context.bind_template(self):
File "C:\Python34\lib\contextlib.py", line 59, in __enter__
return next(self.gen)
File "C:\Python34\lib\site-packages\django\template\context.py", line 235, in bind_template
updates.update(processor(self.request))
File "C:\Python34\lib\site-packages\django\template\context_processors.py", line 56, in i18n
context_extras['LANGUAGE_BIDI'] = translation.get_language_bidi()
File "C:\Python34\lib\site-packages\django\utils\translation\__init__.py", line 177, in get_language_bidi
return _trans.get_language_bidi()
File "C:\Python34\lib\site-packages\django\utils\translation\trans_real.py", line 263, in get_language_bidi
base_lang = get_language().split('-')[0]
AttributeError: 'NoneType' object has no attribute 'split'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Python34\lib\site-packages\django\utils\formats.py", line 103, in get_format
cached = _format_cache[cache_key]
KeyError: ('r', None)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Program Files (x86)\JetBrains\PyCharm 4.5.3\helpers\pycharm\django_manage.py", line 41, in <module>
run_module(manage_file, None, '__main__', True)
File "C:\Python34\lib\runpy.py", line 182, in run_module
return _run_module_code(code, init_globals, run_name, mod_spec)
File "C:\Python34\lib\runpy.py", line 96, in _run_module_code
mod_name, mod_spec, pkg_name, script_name)
File "C:\Python34\lib\runpy.py", line 85, in _run_code
exec(code, run_globals)
File "C:/Users/Jaysp_000/firstSite/PROJECTone\manage.py", line 10, in <module>
execute_from_command_line(sys.argv)
File "C:\Python34\lib\site-packages\django\core\management\__init__.py", line 338, in execute_from_command_line
utility.execute()
File "C:\Python34\lib\site-packages\django\core\management\__init__.py", line 330, in execute
self.fetch_command(subcommand).run_from_argv(self.argv)
File "C:\Python34\lib\site-packages\django\core\management\base.py", line 390, in run_from_argv
self.execute(*args, **cmd_options)
File "C:\Python34\lib\site-packages\django\core\management\base.py", line 441, in execute
output = self.handle(*args, **options)
File "C:/Users/Jaysp_000/firstSite/PROJECTone\blog_static\management\commands\update_blog1.py", line 78, in handle
resp = client.get(path)
File "C:\Python34\lib\site-packages\django\test\client.py", line 500, in get
**extra)
File "C:\Python34\lib\site-packages\django\test\client.py", line 303, in get
return self.generic('GET', path, secure=secure, **r)
File "C:\Python34\lib\site-packages\django\test\client.py", line 379, in generic
return self.request(**r)
File "C:\Python34\lib\site-packages\django\test\client.py", line 448, in request
response = self.handler(environ)
File "C:\Python34\lib\site-packages\django\test\client.py", line 122, in __call__
response = self.get_response(request)
File "C:\Python34\lib\site-packages\django\core\handlers\base.py", line 218, in get_response
response = self.handle_uncaught_exception(request, resolver, sys.exc_info())
File "C:\Python34\lib\site-packages\django\core\handlers\base.py", line 261, in handle_uncaught_exception
return debug.technical_500_response(request, *exc_info)
File "C:\Python34\lib\site-packages\django\views\debug.py", line 97, in technical_500_response
html = reporter.get_traceback_html()
File "C:\Python34\lib\site-packages\django\views\debug.py", line 384, in get_traceback_html
return t.render(c)
File "C:\Python34\lib\site-packages\django\template\base.py", line 209, in render
return self._render(context)
File "C:\Python34\lib\site-packages\django\template\base.py", line 201, in _render
return self.nodelist.render(context)
File "C:\Python34\lib\site-packages\django\template\base.py", line 903, in render
bit = self.render_node(node, context)
File "C:\Python34\lib\site-packages\django\template\debug.py", line 79, in render_node
return node.render(context)
File "C:\Python34\lib\site-packages\django\template\debug.py", line 89, in render
output = self.filter_expression.resolve(context)
File "C:\Python34\lib\site-packages\django\template\base.py", line 674, in resolve
new_obj = func(obj, *arg_vals)
File "C:\Python34\lib\site-packages\django\template\defaultfilters.py", line 771, in date
return formats.date_format(value, arg)
File "C:\Python34\lib\site-packages\django\utils\formats.py", line 136, in date_format
return dateformat.format(value, get_format(format or 'DATE_FORMAT', use_l10n=use_l10n))
File "C:\Python34\lib\site-packages\django\utils\formats.py", line 110, in get_format
for module in get_format_modules(lang):
File "C:\Python34\lib\site-packages\django\utils\formats.py", line 82, in get_format_modules
modules = _format_modules_cache.setdefault(lang, list(iter_format_modules(lang, settings.FORMAT_MODULE_PATH)))
File "C:\Python34\lib\site-packages\django\utils\formats.py", line 51, in iter_format_modules
if not check_for_language(lang):
File "C:\Python34\lib\site-packages\django\utils\translation\__init__.py", line 181, in check_for_language
return _trans.check_for_language(lang_code)
File "C:\Python34\lib\functools.py", line 472, in wrapper
result = user_function(*args, **kwds)
File "C:\Python34\lib\site-packages\django\utils\translation\trans_real.py", line 409, in check_for_language
if not language_code_re.search(lang_code):
TypeError: expected string or buffer
Here's my code for the update_blog1 command:
from django.core.management.base import BaseCommand
from django.core.urlresolvers import reverse
from django.test.client import Client
import sys, os
from optparse import make_option
from P1config.settings import STATICBLOG_POST_DIRECTORY, STATICBLOG_COMPILE_DIRECTORY


class Command(BaseCommand):
    help = "Compile blog posts from html to markdown, and upload images to S3. Defaults to processing only new blog posts."

    option_list = BaseCommand.option_list + (
        make_option(
            '--all',
            action='store_true',
            dest='all',
            default=False,
            help='Get all blog posts, regardless of date'
        ),
        make_option(
            '--name',
            action='store',
            dest='post_name',
            default=False,
            help='Get named blog post'
        ),
    )

    def handle(self, *args, **options):
        verbosity = int(options.get('verbosity'))
        client = Client()
        outdir = STATICBLOG_COMPILE_DIRECTORY
        posts = []
        previews = []

        if options['all']:
            if verbosity > 3:
                print('Compiling all blog posts')
            posts = self._get_all_posts()
        elif options['post_name']:
            posts = self._get_named_posts(options['post_name'])
        else:
            if verbosity > 3:
                print('Compiling new blog posts')
            posts = self._get_all_posts(new=True)

        if verbosity > 3:
            print('%d posts found' % len(posts))
            print('----------------------------')

        for post in posts:
            if verbosity > 3:
                print("Compiling " + post['md_name'] + " to " + post['html_name'])
            path = reverse('blog_static.views.archive') + post['path']
            # path = '/preview/' + post['path']
            resp = client.get(path)
            if os.path.exists(outdir + post['path']) == False:
                try:
                    with open(outdir + post['path'], 'r') as f:
                        pass
                except IOError as e:
                    os.mkdir(outdir + post['path'])
            with open(outdir + post['html_name'], 'wb') as f:
                f.write(resp.content)

        if len(posts) > 0 and verbosity > 3:
            print('----------------------------')
        if verbosity > 3:
            print('Updating listings...')
            print('----------------------------')

        path = reverse('blog_static.views.archive')
        resp = client.get(path)
        with open(STATICBLOG_COMPILE_DIRECTORY + 'index.html', 'wb') as f:
            f.write(resp.content)
        if verbosity > 3:
            print('Done')

    def _get_all_posts(self, new=False):
        posts = []
        for item in os.listdir(STATICBLOG_POST_DIRECTORY):
            post = self._create_post(item, new)
            if post:
                posts.append(post)
        return posts

    def _get_named_posts(self, post):
        post_list = post.split(',')
        posts = []
        for item in post_list:
            try:
                with open(STATICBLOG_POST_DIRECTORY + item, 'r') as f:
                    post = self._create_post(item)
                    if post:
                        posts.append(post)
            except IOError as e:
                print('\033[01;31m' + str(e) + '\033[0m', file=sys.stderr)
        return posts

    def _create_post(self, item, new=False):
        outdir = STATICBLOG_POST_DIRECTORY
        compiled_post = {
            'md_name' : item,
            'html_name' : '',
            'path' : '',
            'html' : '',
        }
        if item.endswith('.md'):
            compiled_post['path'] = item.replace('.md', '')
            compiled_post['html_name'] = compiled_post['path'] + '/index.html'
            if new:
                try:
                    with open(STATICBLOG_COMPILE_DIRECTORY + compiled_post['html_name'], 'r') as f:
                        return False
                except IOError as e:
                    return compiled_post
            else:
                return compiled_post
If you look at the code, the methods _get_named_posts(), _create_post(), and handle() each contain an embedded open() call. These open() calls (e.g. open(outdir + post['path'], 'r') or open(STATICBLOG_POST_DIRECTORY + item, 'r')) are where the problem lies, as pointed out by PyCharm.
In my views.py file, I have this...
# Django imports
from django.template import RequestContext
from django.shortcuts import render_to_response, render
from django.core.files.storage import get_storage_class
from django.core.files.base import ContentFile
from django.views.decorators.csrf import csrf_exempt
from django.db import models

# Create a 'shortcut' function to wrap request in RequestContext()
def render_response(req, *args, **kwargs):
    """Shortcut to wrap request in RequestContext"""
    kwargs['context_instance'] = RequestContext(req)
    return render_to_response(*args, **kwargs)

# Standard Python lib
import os, sys, urllib, hashlib

# 3rd party apps
import markdown
from markdown.inlinepatterns import ImagePattern, IMAGE_LINK_RE

# from config folder
from P1config.settings import STATICBLOG_COMPILE_DIRECTORY, \
                              STATICBLOG_POST_DIRECTORY, \
                              STATICBLOG_STORAGE

###################################################################################

class S3ImagePattern(ImagePattern):
    """Wrapper class to handle image matches in a markdown document"""

    def handleMatch(self, match):
        node = ImagePattern.handleMatch(self, match)
        # check 'src' to see whether it is local
        src = node.attrib.get('src')
        storage_class = get_storage_class(STATICBLOG_STORAGE)
        storage = storage_class()
        # if it is remote we need to do some downloading!
        if 'http://' in src or 'https://' in src:
            img_data = urllib.request.urlopen(src).read()
            md5 = hashlib.md5()
            md5.update(img_data)
            name = md5.hexdigest() + '/' + os.path.basename(src)
        else:
            with open(STATICBLOG_POST_DIRECTORY + src) as fhandle:
                img_data = fhandle.read()
            name = src
        print('Uploading ' + src, file=sys.stderr)
        try:
            storage.save(name, ContentFile(img_data))
            node.attrib['src'] = storage.url(name)
            print('Uploaded ' + src + ' to ' + storage.url(name), file=sys.stderr)
        except Exception as e:
            print(str(e), file=sys.stderr)
            print('\033[01;31mUpload of %s failed\033[0m' % src, file=sys.stderr)
        return node


def render_post(request, post_name):
    """Render a blog post based on a .post template

    The used template is rendered as html in the folder defined
    by STATICBLOG_COMPILE_DIRECTORY
    """
    content = ""
    mdown = markdown.Markdown(extensions=['meta'])
    mdown.inlinePatterns['image_link'] = S3ImagePattern(IMAGE_LINK_RE, mdown)

    try:
        post_file_dir = os.path.join(STATICBLOG_POST_DIRECTORY, post_name + '.md')
        with open(post_file_dir, 'r') as pfDIR:
            content = pfDIR.read()         # read the ENTIRE '.md' document
            html = mdown.convert(content)  # convert it from '.md' to '.html'
    except IOError as e:
        print(str(e))
        with open(os.path.join(STATICBLOG_POST_DIRECTORY, 'preview2.md')) as f:
            content = f.read()
        html = mdown.convert(content)

    post = { 'content' : html, }
    try:
        post['date'] = mdown.Meta['date'][0]
        post['title'] = mdown.Meta['title'][0]
        post['author'] = mdown.Meta['author'][0]
        post['summary'] = mdown.Meta['summary'][0]
        post['tags'] = mdown.Meta['tags'][0]
    except:
        pass

    meta = {}
    if 'title' in post:
        meta['title'] = post['title']

    # Context object containing the post and meta contexts
    context = {'post' : post, 'meta' : meta}
    return render_response(  # but could I just use render?
        request,
        'post2.html',
        context
    )


def archive(request):
    mdown = markdown.Markdown(extensions=['meta'])
    # Create an empty post list for now
    posts = []
    import string
    # Look at every 'item' in the STATICBLOG_COMPILE_DIRECTORY
    for item in os.listdir(STATICBLOG_COMPILE_DIRECTORY):
        # skip items that end with '.md'; we only want compiled posts here
        if item.endswith('.md'):
            continue
        # attempt to use that 'item'...
        try:
            # ...by opening the matching markdown source,
            with open(os.path.join(STATICBLOG_POST_DIRECTORY, item + '.md')) as fhandle:
                # ...reading the entire markdown file,
                content = fhandle.read()
                # ...and converting it to HTML.
                mdown.convert(content)
            post = { 'name' : item, }
            if 'title' in mdown.Meta and len(mdown.Meta['title'][0]) > 0:
                # store the post's title from the "Meta" section of the '.md' document
                post['title'] = mdown.Meta['title'][0]
            else:
                # fall back to the file name with the '-' characters removed,
                # so we can use it as the post's title
                post['title'] = string.capwords(item.replace('-', ' '))
            # if there is a 'date' in the item's Meta attribute,
            # copy it into the post dict
            if 'date' in mdown.Meta:
                post['date'] = mdown.Meta['date'][0]
            posts.append(post)
        except:
            pass

    from operator import itemgetter
    posts = sorted(posts, key=itemgetter('date'))
    posts.reverse()
    return render_response(  # but could I just use render?
        request,
        'archive.html',
        {'posts' : posts}
    )


@csrf_exempt
def handle_hook(request):
    from django.http import HttpResponse
    from django.core.management import call_command
    result = call_command('update_blog', verbosity=0)
    return HttpResponse(result)
I don't know how to fix this. Can you let me know how to fix these issues? I don't know where I can put a string or buffer in my code.
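One lead worth checking (an assumption drawn from the traceback, not a verified fix): the chain dies inside Django's translation machinery because get_language() returns None, which can happen when no language is active while the test Client renders templates from a management command. Activating a language before calling client.get() may be enough:

from django.utils import translation

# in Command.handle(), before using django.test.client.Client:
translation.activate('en')  # assumption: gives get_language() a real language code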
When I disable the proxy server in my web browser settings and comment out the proxy handler code, the code below works fine.
import urllib2
import urllib2_file
import urllib
import random
import mimetypes
import string
from os import listdir
import time
from google.refine import refine
from google.refine import facet

proxy = urllib2.ProxyHandler({'http': '10.200.1.26'})
opener = urllib2.build_opener(proxy)
urllib2.install_opener(opener)


def encode_multipart(fields, files, boundary=None):
    def escape_quote(s):
        return s.replace('"', '\\"')

    if boundary is None:
        boundary = ''.join(random.choice(_BOUNDARY_CHARS) for i in range(30))
    lines = []
    for name, value in fields.items():
        lines.extend((
            '--{0}'.format(boundary),
            'Content-Disposition: form-data; name="upload"',  #.format(escape_quote(name)),
            '',
            str(value),
        ))
    for name, value in files.items():
        filename = value['filename']
        if 'mimetype' in value:
            mimetype = value['mimetype']
        else:
            mimetype = mimetypes.guess_type(filename)[0] or 'application/octet-stream'
        lines.extend((
            '--{0}'.format(boundary),
            'Content-Disposition: form-data; name="upload"; filename="{0}"'.format(escape_quote(filename)),
            'Content-Type: {0}'.format(mimetype),
            '',
            value['content'],
        ))
    lines.extend((
        '--{0}--'.format(boundary),
        '',
    ))
    body = '\r\n'.join(lines)
    headers = {
        'Content-Type': 'multipart/form-data; boundary={0}'.format(boundary),
        'Content-Length': str(len(body)),
    }
    return (body, headers)

_BOUNDARY_CHARS = string.digits + string.ascii_letters

u = urllib2.urlopen("http://127.0.0.1:3333/command/core/create-importing-job", data=urllib.urlencode({"test": ""}))
a = u.read()
id = ""
for i in a:
    if i.isdigit():
        id += str(i)
# sample output '{ "jobID" : 1393566803991 }'

files = {}
pathtoXML = r"C:\75"
#pathtoXML = r"C:\AM\trial"
for i in listdir(pathtoXML):
    files[i] = {'filename': i, 'content': open(pathtoXML + "\\" + i).read()}

# load raw data using the job id found in
url = "http://127.0.0.1:3333/command/core/importing-controller?controller=core%2Fdefault-importing-controller&jobI=" + id + "&subCommand=load-raw-data"
data, headers = encode_multipart({}, files)
#print len(data)
#print headers
req = urllib2.Request(url, data=data, headers=headers)
f = urllib2.urlopen(req)
f.read()

# get job status
u = urllib2.urlopen("http://127.0.0.1:3333/command/core/get-importing-job-status?jobID=" + id + "", "test")
u.read()

# from fileSelection update file selection
u = urllib2.urlopen("http://127.0.0.1:3333/command/core/importing-controller?controller=core%2Fdefault-importing-controller&subCommand=update-file-selection&jobID=" + id + "", "fileSelection=%5B0%2C1%2C2%2C3%5D")
u.read()

# init parser format text
u = urllib2.urlopen("http://127.0.0.1:3333/command/core/importing-controller?controller=core%2Fdefault-importing-controller&jobID=" + id + "&subCommand=initialize-parser-ui&format=text%2Fxml")
u.read()

# update format and options
updateformatoptionurl = "http://127.0.0.1:3333/command/core/importing-controller?controller=core%2Fdefault-importing-controller&jobID=" + id + "&subCommand=update-format-and-options"
d = urllib.urlencode({"format": "text/xml", "options": {"recordPath": ["ArrayOfAfiles", "Afiles"], "limit": -1, "includeFileSources": "false", "guessCellValueTypes": "false"}})
u = urllib2.urlopen(updateformatoptionurl, d)
u.read()
'{"status":"ok"}'

# get-models
u = urllib2.urlopen("http://127.0.0.1:3333/command/core/get-models?importingJobID=" + id)
u.read()

# create project from import job
createfromimporturl = "http://127.0.0.1:3333/command/core/importing-controller?controller=core%2Fdefault-importing-controller&jobID=" + id + "&subCommand=create-project"
d = urllib.urlencode({"format": "text/xml", "options": {"recordPath": ["ArrayOfAfiles", "Afiles"], "limit": -1, "includeFileSources": "false", "projectName": time.ctime()}})
u = urllib2.urlopen(createfromimporturl, d)
r = u.read()
After adding the proxy handler code it stopped working; when I run the code it complains:
Traceback (most recent call last):
File "C:\hari\trial.py", line 87, in <module>
u = urllib2.urlopen("http://127.0.0.1:3333/command/core/create-importing-job",data=urllib.urlencode({"test":""}))
File "C:\Python27\lib\urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "C:\Python27\lib\urllib2.py", line 391, in open
response = self._open(req, data)
File "C:\Python27\lib\urllib2.py", line 409, in _open
'_open', req)
File "C:\Python27\lib\urllib2.py", line 369, in _call_chain
result = func(*args)
File "C:\Python27\urllib2_file.py", line 207, in http_open
return self.do_open(httplib.HTTP, req)
File "C:\Python27\urllib2_file.py", line 298, in do_open
return self.parent.error('http', req, fp, code, msg, hdrs)
File "C:\Python27\lib\urllib2.py", line 435, in error
return self._call_chain(*args)
File "C:\Python27\lib\urllib2.py", line 369, in _call_chain
result = func(*args)
File "C:\Python27\lib\urllib2.py", line 518, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
HTTPError: HTTP Error 404: Not Found
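A plausible explanation (an assumption, not verified): installing the ProxyHandler globally routes the requests to 127.0.0.1 through the 10.200.1.26 proxy as well, and the proxy returns 404 because it cannot reach your local OpenRefine server. A sketch that sends only non-local URLs through the proxy:

import urllib2

proxied_opener = urllib2.build_opener(urllib2.ProxyHandler({'http': '10.200.1.26'}))
direct_opener = urllib2.build_opener(urllib2.ProxyHandler({}))  # empty dict disables proxying

def open_url(url, data=None):
    # talk to the local OpenRefine server directly; proxy everything else
    if '127.0.0.1' in url or 'localhost' in url:
        return direct_opener.open(url, data)
    return proxied_opener.open(url, data)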