Problem passing XML to HTTP Post api using urllib in Python - python

I am trying to access a REST api and need to call it with a line of XML for a filter condition. My apologies for providing code that others cannot access. When I execute this code, I get the error message listed below.
import urllib2
import urllib
import hashlib
import hmac
import time
import random
import base64
def MakeRequest():
url = 'https://api01.marketingstudio.com/API/Gateway/9'
publickey = ''
privatekey = ''
method = 'Query'
nonce = random.randrange(123400, 9999999)
age = int(time.time())
final = str(age) + '&' + str(nonce) + '&' + method.lower() + '&' + url.lower()
converted = hmac.new(privatekey, final, hashlib.sha1).digest()
authorization = 'AMS ' + publickey + ':' + base64.b64encode(converted)
xml_string = "<list><FilterItems><FilterItem attribute='pageNumber' value='1'/></FilterItems></list>"
form = {'XML':xml_string}
data = urllib.urlencode(form)
headers = {'Content-Type': 'application/xml'}
req = urllib2.Request(url,data,headers)
req.add_header('ams-method', method)
req.add_header('ams-nonce', nonce)
req.add_header('ams-age', age)
req.add_header('Authorization', authorization)
r = urllib2.urlopen(req)
print r.read()
MakeRequest();
Here is the error message.
Data at the root level is invalid. Line 1, position 1.
at Aprimo.REST.Core.RESTService.GetRequest(String URI, HttpRequest req)
at Aprimo.REST.RESTHandler.GetRequest(String apiUrl, HttpContext context)
at Aprimo.REST.RESTHandler.ProcessRequest(HttpContext context)
I think this has the correct logic and filter conditions, what should I look at to get this to work. Thanks.
Per #Mark's suggestion I removed the urlencode for the XML string and got the following TraceBack:
Traceback (most recent call last):
File "file.py", line 36, in <module>
MakeRequest();
File "file.py", line 32, in MakeRequest
r = urllib2.urlopen(req)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 126, in urlopen
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 392, in open
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 410, in _open
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 370, in _call_chain
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 1194, in https_open
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 1155, in do_open
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 941, in request
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 975, in _send_request
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 937, in endheaders
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 801, in _send_output
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 773, in send
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/ssl.py", line 207, in sendall
TypeError: unhashable type

So the problem was with the formatting of the form variable and the encoding I was trying to do. Revising the following lines gets the call to work. I did not need to specify the headers.
xml_string = "<list><FilterItems><FilterItem attribute='pageNumber' value='1'/></FilterItems></list>"
data = (xml_string)
req = urllib2.Request(url,data)

Related

"Cannot use string pattern on bytes like object" when trying to parse htmls for emails

So I have a script I've been working with for a few days trying to get a list of emails from a csv I have, but now I've run into this roadblock. Here is the code:
import sys
try:
import urllib.request as urllib2
except ImportError:
import urllib2
import re
import csv
list1 = []
list2 = []
list3 = []
def addList():
with open('file.csv', 'rt') as f:
reader = csv.reader(f)
for row in reader:
for s in row:
list2.append(s)
def getAddress(url):
http = "http://"
https = "https://"
if http in url:
return url
elif https in url:
return url
else:
url = "http://" + url
return url
def parseAddress(url):
global list3
try:
website = urllib2.urlopen(getAddress(url))
html = website.read()
addys = re.findall('''[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*#(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?''', html, flags=re.IGNORECASE)
global list1
list1.append(addys)
except urllib2.HTTPError as err:
print ("Cannot retrieve URL: HTTP Error Code: "), err.code
list3.append(url)
except urllib2.URLError as err:
print ("Cannot retrive URL: ") + err.reason[1]
list3.append(url)
def execute():
global list2
addList()
totalNum = len(list2)
atNum = 1
for s in list2:
parseAddress(s)
print ("Processing ") + str(atNum) + (" out of ") + str(totalNum)
atNum = atNum + 1
print ("Completed. Emails parsed: ") + str(len(list1)) + "."
### MAIN
def main():
global list2
execute()
global list1
myFile = open("finishedFile.csv", "w+")
wr = csv.writer(myFile, quoting=csv.QUOTE_ALL)
for s in list1:
wr.writerow(s)
myFile.close
global list3
failFile = open("failedSites.csv", "w+")
write = csv.writer(failFile, quoting=csv.QUOTE_ALL)
for j in list3:
write.writerow(j)
failFile.close
main()
and when I run it I get this error:
Traceback (most recent call last):
File "pagescanner.py", line 85, in <module>
main()
File "pagescanner.py", line 71, in main
execute()
File "pagescanner.py", line 60, in execute
parseAddress(s)
File "pagescanner.py", line 42, in parseAddress
addys = re.findall('''[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*#(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?''', html, flags=re.IGNORECASE)
File "/usr/lib/python3.5/re.py", line 213, in findall
return _compile(pattern, flags).findall(string)
TypeError: cannot use a string pattern on a bytes-like object
So I've figured out that I need to figure out how to encode the html string into bytes for the encoding, and Tyler's answer below helped me do so but now I'm getting this error:
Traceback (most recent call last):
File "/usr/lib/python3.5/urllib/request.py", line 1254, in do_open
h.request(req.get_method(), req.selector, req.data, headers)
File "/usr/lib/python3.5/http/client.py", line 1107, in request
self._send_request(method, url, body, headers)
File "/usr/lib/python3.5/http/client.py", line 1152, in _send_request
self.endheaders(body)
File "/usr/lib/python3.5/http/client.py", line 1103, in endheaders
self._send_output(message_body)
File "/usr/lib/python3.5/http/client.py", line 934, in _send_output
self.send(msg)
File "/usr/lib/python3.5/http/client.py", line 877, in send
self.connect()
File "/usr/lib/python3.5/http/client.py", line 849, in connect
(self.host,self.port), self.timeout, self.source_address)
File "/usr/lib/python3.5/socket.py", line 712, in create_connection
raise err
File "/usr/lib/python3.5/socket.py", line 703, in create_connection
sock.connect(sa)
OSError: [Errno 22] Invalid argument
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "pagescanner.py", line 39, in parseAddress
website = urllib2.urlopen(getAddress(url))
File "/usr/lib/python3.5/urllib/request.py", line 163, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.5/urllib/request.py", line 466, in open
response = self._open(req, data)
File "/usr/lib/python3.5/urllib/request.py", line 484, in _open
'_open', req)
File "/usr/lib/python3.5/urllib/request.py", line 444, in _call_chain
result = func(*args)
File "/usr/lib/python3.5/urllib/request.py", line 1282, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "/usr/lib/python3.5/urllib/request.py", line 1256, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [Errno 22] Invalid argument>
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "pagescanner.py", line 85, in <module>
main()
File "pagescanner.py", line 71, in main
execute()
File "pagescanner.py", line 60, in execute
parseAddress(s)
File "pagescanner.py", line 51, in parseAddress
print ("Cannot retrive URL: ") + err.reason[1]
TypeError: 'OSError' object is not subscriptable
Does this mean that one of the urls from the list isn't a valid url? I thought I had finally removed all fo the bad urls from my csv file but I may need to take another look
To answer your question, you just need to decode the response properly.
Instead of html = website.read() try html = website.read().decode('utf-8')
See Convert bytes to a string
I'll also recommend a couple things that might make your life a little easier.
urllib.parse makes dealing with URLs much less of a headache and tends to make things a lot more readable when you inevitably encounter a bug somewhere.
https://docs.python.org/3.5/library/urllib.parse.html
The requests library is also pretty much the gold standard for dealing with HTTP requests and might help solve a bit of the confusion around encoding and other overhead from the standard urllib.request.
https://requests.readthedocs.io/en/master/
And beautifulsoup is a fantastic tool for dealing with HTML.
https://www.crummy.com/software/BeautifulSoup/bs4/doc/#

Python urllib unicode exception in 302 redirection [duplicate]

This question already has an answer here:
Urllib Unicode Error, no unicode involved
(1 answer)
Closed 3 years ago.
The situation is:
I'm scraping one website, the urls for the pages follow the pattern:
http://www.pageadress/somestuff/ID-HERE/
Nothing unusual.
I have a lot of id's that i need to scrape and most of them work correctly.
However, the page behaves in portal-like way. In browser, when you enter such address, you get redirected to:
http://www.pageadress/somestuff/ID-HERE-title_of_subpage
What might be problematic is that sometimes that title might contain non-ascii characters (approximately 0.01% of cases), therefore (i think that's the issue) i get the exception:
File "/usr/lib/python3.4/urllib/request.py", line 161, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.4/urllib/request.py", line 469, in open
response = meth(req, response)
File "/usr/lib/python3.4/urllib/request.py", line 579, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python3.4/urllib/request.py", line 501, in error
result = self._call_chain(*args)
File "/usr/lib/python3.4/urllib/request.py", line 441, in _call_chain
result = func(*args)
File "/usr/lib/python3.4/urllib/request.py", line 684, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "/usr/lib/python3.4/urllib/request.py", line 463, in open
response = self._open(req, data)
File "/usr/lib/python3.4/urllib/request.py", line 481, in _open
'_open', req)
File "/usr/lib/python3.4/urllib/request.py", line 441, in _call_chain
result = func(*args)
File "/usr/lib/python3.4/urllib/request.py", line 1210, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "/usr/lib/python3.4/urllib/request.py", line 1182, in do_open
h.request(req.get_method(), req.selector, req.data, headers)
File "/usr/lib/python3.4/http/client.py", line 1088, in request
self._send_request(method, url, body, headers)
File "/usr/lib/python3.4/http/client.py", line 1116, in _send_request
self.putrequest(method, url, **skips)
File "/usr/lib/python3.4/http/client.py", line 973, in putrequest
self._output(request.encode('ascii'))
UnicodeEncodeError: 'ascii' codec can't encode characters in position 38-39: ordinal not in range(128).
The bizarre thing is that no unicode characters in url i'm redirected to are actually on position 38-39, but there are on others.
The code being used:
import socket
import urllib.parse
import urllib.request
socket.setdefaulttimeout(30)
url = "https://www.bettingexpert.com/archive/tip/3207221"
headers = {'User-Agent': 'Mozilla/5.0'}
content = urllib.request.urlopen(urllib.request.Request(url, None, headers)).read().decode('utf-8')
Any method to get around it, preferably without using other libraries?
//Oh the glorious world of python, creating 1000s of problems i wouldn't even think were possible if i was writing in ruby.
So, i've found out solution to my specific problem.
I've just gathered the remaining part of the 'url' from their api, and after some minor transformations i can access the page without any redirections.
That, of course, doesn't mean that i solved the general problem- it might come back later in the future, so i've developed a 'solution'.
By posting this code here i've basically guaranteed myself that i won't ever be employed as programmer, so don't look at it if you're eating.
"Capybara" gem and poltergeist needed because why not?
#test.py
import socket
import urllib.parse
import urllib.request
import os
tip_id = 3207221
socket.setdefaulttimeout(30)
url = "http://www.bettingexpert.com/archive/tip/" + tip_id.__str__()
headers = {'User-Agent': 'Mozilla/5.0'}
try:
content = urllib.request.urlopen(urllib.request.Request(url, None, headers)).read().decode('utf-8')
except UnicodeEncodeError:
print("Overkill activated")
os.system('ruby test.rb ' + tip_id.__str__())
with open(tip_id.__str__(), 'r') as file:
content = file.read()
os.remove(tip_id.__str__())
print(content)
.
#test.rb
require 'capybara'
require 'capybara/dsl'
require 'capybara/poltergeist'
Capybara.register_driver :poltergeist_no_timeout do |app|
driver = Capybara::Poltergeist::Driver.new(app, timeout: 30)
driver.browser.url_blacklist = %w(
http://fonts.googleapis.com
http://html5shiv.googlecode.com
)
driver
end
Capybara.default_driver = :poltergeist_no_timeout
Capybara.run_server = false
include Capybara::DSL
begin
page.reset_session!
page.visit("http://www.bettingexpert.com/archive/tip/#{ARGV[0]}")
rescue
retry
end
File.open(ARGV[0], 'w') do |file|
file.print(page.html)
end

google distance matrix with python

My problem is i want to get the distance for around 4000 lat longs . I have used Google Direction matrix service for this. To certain extend it worked fine for me. After that i am getting the error of Invalid request .Its because of GET method. I want a solution how can i use Distance matrix service with POST using Python.
Appreciate your help on this. Thanks in Advance.
I am writing a google distance matrix with python using appengine, i got struck with urllib post method
code:
url = 'http://maps.googleapis.com/maps/api/distancematrix/json'
conn = getConnection()
cursor = conn.cursor()
origins=[]
try:
cursor.execute('select username,lat,lng,cabNo,orderno from tripsheet order by username;')
origins= cursor.fetchall()
except:
self.response.out.write("Some thing bad happened")
conn.close()
responseArray= []
for o in origins :
origin= {}
key= "blah"
origin = {"name":o[0],"key":key, "latitude":o[1],"longitude":o[2],"cabNo":o[3],"order":o[4]}
responseArray.append(origin)
url=url+o[1]+','+o[2]+'|'
values = {
'sensor' : 'false',
'mode' : 'driving',
'avoid' : 'tolls',
'destinations': '%s,%s' % (destination["lat"] ,destination["lon"])
}
data = urllib.urlencode(values)
req = urllib2.Request(url, data)
response = urllib2.urlopen(req)
the_page = response.read()
self.response.out.write(the_page)
i am trying passing around 4000 origin and single destination.
getting below error message because it is taking as GET . I wanted to convert into POST using urllib2:
Traceback (most recent call last):
File "/home/xxx/Projects/google_appengine/google/appengine/ext/webapp/_webapp25.py", line 714, in __call__
handler.get(*groups)
File "/home/xxx/4.2WorkSpace/RouteOptimization/src/main.py", line 41, in get
self.calculate_indv_distance(destination)
File "/home/xxx/4.2WorkSpace/RouteOptimization/src/main.py", line 109, in calculate_indv_distance
response = urllib2.urlopen(req)
File "/usr/lib/python2.7/urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 400, in open
response = self._open(req, data)
File "/usr/lib/python2.7/urllib2.py", line 418, in _open
'_open', req)
File "/usr/lib/python2.7/urllib2.py", line 378, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 1207, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "/usr/lib/python2.7/urllib2.py", line 1182, in do_open
r = h.getresponse()
File "/home/xxx/Projects/google_appengine/google/appengine/dist/httplib.py", line 222, in getresponse
deadline=self.timeout)
File "/home/xxx/Projects/google_appengine/google/appengine/api/urlfetch.py", line 266, in fetch
return rpc.get_result()
File "/home/xxx/Projects/google_appengine/google/appengine/api/apiproxy_stub_map.py", line 604, in get_result
return self.__get_result_hook(self)
File "/home/xxx/Projects/google_appengine/google/appengine/api/urlfetch.py", line 370, in _get_fetch_result
'Invalid request URL: ' + url + error_detail)
InvalidURLError: Invalid request URL: http://maps.googleapis.com/maps/api/distancematrix/
Any help really appreciate.

Socket error when using gdata Youtube API in Python

I'm using gdata to map YouTube URLs to video titles, using the following code:
import gdata.youtube.service as youtube
import re
import queue
import urlparse
ytservice = youtube.YouTubeService()
ytservice.ssl = True
ytservice.developer_key = '' # snip
class youtube(mediaplugin):
def __init__(self, parsed_url):
self.url = parsed_url
self.video_id = urlparse.parse_qs(parsed_url.query)['v'][0]
self.ytdata = ytservice.GetYouTubeVideoEntry(self.video_id)
print self.ytdata
I get the following socket exception when calling service.GetYouTubeVideoEntry():
File "/Users/haldean/Documents/qpi/qpi/media.py", line 21, in __init__
self.ytdata = ytservice.GetYouTubeVideoEntry(self.video_id)
File "/Users/haldean/Documents/qpi/lib/python2.7/site-packages/gdata/youtube/service.py", line 210, in GetYouTubeVideoEntry
return self.Get(uri, converter=gdata.youtube.YouTubeVideoEntryFromString)
File "/Users/haldean/Documents/qpi/lib/python2.7/site-packages/gdata/service.py", line 1069, in Get
headers=extra_headers)
File "/Users/haldean/Documents/qpi/lib/python2.7/site-packages/atom/__init__.py", line 93, in optional_warn_function
return f(*args, **kwargs)
File "/Users/haldean/Documents/qpi/lib/python2.7/site-packages/atom/service.py", line 186, in request
data=data, headers=all_headers)
File "/Users/haldean/Documents/qpi/lib/python2.7/site-packages/atom/http_interface.py", line 148, in perform_request
return http_client.request(operation, url, data=data, headers=headers)
File "/Users/haldean/Documents/qpi/lib/python2.7/site-packages/atom/http.py", line 163, in request
connection.endheaders()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 937, in endheaders
self._send_output(message_body)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 797, in _send_output
self.send(msg)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 759, in send
self.connect()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 1140, in connect
self.timeout, self.source_address)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/socket.py", line 553, in create_connection
for res in getaddrinfo(host, port, 0, SOCK_STREAM):
gaierror: [Errno 8] nodename nor servname provided, or not known
I'm at a loss as to how to even begin debugging this. Any ideas appreciated. Thanks!
Edit:
In response to a question asked in comments, video_id is qh-mwjF-OMo and parsed_url is:
ParseResult(scheme=u'http', netloc=u'www.youtube.com', path=u'/watch', params='', query=u'v=qh-mwjF-OMo&feature=g-user-u', fragment='')
My mistake was that the video_id should be passed as a keyword parameter, like so:
self.ytdata = ytservice.GetYouTubeVideoEntry(video_id=self.video_id)
It seems that the socket exception is the only layer of gdata that will throw an exception; it tries to get a URL blindly based on the arguments and it only fails when the URL fetch fails.

Send XML using urllib

Following this link I tried sending a XML file to my web service using GET:
import urllib
from createfile import XML
URL = "http://http://localhost:8080/mywebservice
parameter = urllib.urlencode({'XML': XML})
response = urllib.urlopen(URL + "?%s" % parameter)
print response.read()
But it gives me this error:
Traceback (most recent call last):
File "C:\eclipse\testing_workspace\http tester\src\Main.py", line 15, in <module>
response = urllib.urlopen(URL + "?%s" % parameter)
File "C:\Python27\lib\urllib.py", line 84, in urlopen
return opener.open(url)
File "C:\Python27\lib\urllib.py", line 205, in open
return getattr(self, name)(url)
File "C:\Python27\lib\urllib.py", line 331, in open_http
h = httplib.HTTP(host)
File "C:\Python27\lib\httplib.py", line 1047, in __init__
self._setup(self._connection_class(host, port, strict))
File "C:\Python27\lib\httplib.py", line 681, in __init__
self._set_hostport(host, port)
File "C:\Python27\lib\httplib.py", line 706, in _set_hostport
raise InvalidURL("nonnumeric port: '%s'" % host[i+1:])
httplib.InvalidURL: nonnumeric port: ''
But if I use POST method described in that link, it works good, my problem is that I need to use GET, so why I am getting thoose errors ?
response = urllib.urlopen(URL, parameter) // this works
Sending a XML file through a GET request is bare nonsense.
Use POST instead.

Categories