Send XML using urllib

Send XML using urllib - python

Following this link I tried sending a XML file to my web service using GET:
import urllib
from createfile import XML
URL = "http://http://localhost:8080/mywebservice
parameter = urllib.urlencode({'XML': XML})
response = urllib.urlopen(URL + "?%s" % parameter)
print response.read()
But it gives me this error:
Traceback (most recent call last):
File "C:\eclipse\testing_workspace\http tester\src\Main.py", line 15, in <module>
response = urllib.urlopen(URL + "?%s" % parameter)
File "C:\Python27\lib\urllib.py", line 84, in urlopen
return opener.open(url)
File "C:\Python27\lib\urllib.py", line 205, in open
return getattr(self, name)(url)
File "C:\Python27\lib\urllib.py", line 331, in open_http
h = httplib.HTTP(host)
File "C:\Python27\lib\httplib.py", line 1047, in __init__
self._setup(self._connection_class(host, port, strict))
File "C:\Python27\lib\httplib.py", line 681, in __init__
self._set_hostport(host, port)
File "C:\Python27\lib\httplib.py", line 706, in _set_hostport
raise InvalidURL("nonnumeric port: '%s'" % host[i+1:])
httplib.InvalidURL: nonnumeric port: ''
But if I use POST method described in that link, it works good, my problem is that I need to use GET, so why I am getting thoose errors ?
response = urllib.urlopen(URL, parameter) // this works

Sending a XML file through a GET request is bare nonsense.
Use POST instead.

Related

"Cannot use string pattern on bytes like object" when trying to parse htmls for emails

So I have a script I've been working with for a few days trying to get a list of emails from a csv I have, but now I've run into this roadblock. Here is the code:
import sys
try:
import urllib.request as urllib2
except ImportError:
import urllib2
import re
import csv
list1 = []
list2 = []
list3 = []
def addList():
with open('file.csv', 'rt') as f:
reader = csv.reader(f)
for row in reader:
for s in row:
list2.append(s)
def getAddress(url):
http = "http://"
https = "https://"
if http in url:
return url
elif https in url:
return url
else:
url = "http://" + url
return url
def parseAddress(url):
global list3
try:
website = urllib2.urlopen(getAddress(url))
html = website.read()
addys = re.findall('''[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*#(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?''', html, flags=re.IGNORECASE)
global list1
list1.append(addys)
except urllib2.HTTPError as err:
print ("Cannot retrieve URL: HTTP Error Code: "), err.code
list3.append(url)
except urllib2.URLError as err:
print ("Cannot retrive URL: ") + err.reason[1]
list3.append(url)
def execute():
global list2
addList()
totalNum = len(list2)
atNum = 1
for s in list2:
parseAddress(s)
print ("Processing ") + str(atNum) + (" out of ") + str(totalNum)
atNum = atNum + 1
print ("Completed. Emails parsed: ") + str(len(list1)) + "."
### MAIN
def main():
global list2
execute()
global list1
myFile = open("finishedFile.csv", "w+")
wr = csv.writer(myFile, quoting=csv.QUOTE_ALL)
for s in list1:
wr.writerow(s)
myFile.close
global list3
failFile = open("failedSites.csv", "w+")
write = csv.writer(failFile, quoting=csv.QUOTE_ALL)
for j in list3:
write.writerow(j)
failFile.close
main()
and when I run it I get this error:
Traceback (most recent call last):
File "pagescanner.py", line 85, in <module>
main()
File "pagescanner.py", line 71, in main
execute()
File "pagescanner.py", line 60, in execute
parseAddress(s)
File "pagescanner.py", line 42, in parseAddress
addys = re.findall('''[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*#(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?''', html, flags=re.IGNORECASE)
File "/usr/lib/python3.5/re.py", line 213, in findall
return _compile(pattern, flags).findall(string)
TypeError: cannot use a string pattern on a bytes-like object
So I've figured out that I need to figure out how to encode the html string into bytes for the encoding, and Tyler's answer below helped me do so but now I'm getting this error:
Traceback (most recent call last):
File "/usr/lib/python3.5/urllib/request.py", line 1254, in do_open
h.request(req.get_method(), req.selector, req.data, headers)
File "/usr/lib/python3.5/http/client.py", line 1107, in request
self._send_request(method, url, body, headers)
File "/usr/lib/python3.5/http/client.py", line 1152, in _send_request
self.endheaders(body)
File "/usr/lib/python3.5/http/client.py", line 1103, in endheaders
self._send_output(message_body)
File "/usr/lib/python3.5/http/client.py", line 934, in _send_output
self.send(msg)
File "/usr/lib/python3.5/http/client.py", line 877, in send
self.connect()
File "/usr/lib/python3.5/http/client.py", line 849, in connect
(self.host,self.port), self.timeout, self.source_address)
File "/usr/lib/python3.5/socket.py", line 712, in create_connection
raise err
File "/usr/lib/python3.5/socket.py", line 703, in create_connection
sock.connect(sa)
OSError: [Errno 22] Invalid argument
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "pagescanner.py", line 39, in parseAddress
website = urllib2.urlopen(getAddress(url))
File "/usr/lib/python3.5/urllib/request.py", line 163, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.5/urllib/request.py", line 466, in open
response = self._open(req, data)
File "/usr/lib/python3.5/urllib/request.py", line 484, in _open
'_open', req)
File "/usr/lib/python3.5/urllib/request.py", line 444, in _call_chain
result = func(*args)
File "/usr/lib/python3.5/urllib/request.py", line 1282, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "/usr/lib/python3.5/urllib/request.py", line 1256, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [Errno 22] Invalid argument>
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "pagescanner.py", line 85, in <module>
main()
File "pagescanner.py", line 71, in main
execute()
File "pagescanner.py", line 60, in execute
parseAddress(s)
File "pagescanner.py", line 51, in parseAddress
print ("Cannot retrive URL: ") + err.reason[1]
TypeError: 'OSError' object is not subscriptable
Does this mean that one of the urls from the list isn't a valid url? I thought I had finally removed all fo the bad urls from my csv file but I may need to take another look

To answer your question, you just need to decode the response properly.
Instead of html = website.read() try html = website.read().decode('utf-8')
See Convert bytes to a string
I'll also recommend a couple things that might make your life a little easier.
urllib.parse makes dealing with URLs much less of a headache and tends to make things a lot more readable when you inevitably encounter a bug somewhere.
https://docs.python.org/3.5/library/urllib.parse.html
The requests library is also pretty much the gold standard for dealing with HTTP requests and might help solve a bit of the confusion around encoding and other overhead from the standard urllib.request.
https://requests.readthedocs.io/en/master/
And beautifulsoup is a fantastic tool for dealing with HTML.
https://www.crummy.com/software/BeautifulSoup/bs4/doc/#

getting gai & socket errors and I'm not sure what either of these mean

I'm using a program I found on github to scrape emails from a website list I have in an excel spreadsheet that I downloaded in csv. Running it gives me this error:
Cannot retrive URL:
Traceback (most recent call last):
File "/usr/lib/python3.5/urllib/request.py", line 1254, in do_open
h.request(req.get_method(), req.selector, req.data, headers)
File "/usr/lib/python3.5/http/client.py", line 1107, in request
self._send_request(method, url, body, headers)
File "/usr/lib/python3.5/http/client.py", line 1152, in _send_request
self.endheaders(body)
File "/usr/lib/python3.5/http/client.py", line 1103, in endheaders
self._send_output(message_body)
File "/usr/lib/python3.5/http/client.py", line 934, in _send_output
self.send(msg)
File "/usr/lib/python3.5/http/client.py", line 877, in send
self.connect()
File "/usr/lib/python3.5/http/client.py", line 849, in connect
(self.host,self.port), self.timeout, self.source_address)
File "/usr/lib/python3.5/socket.py", line 694, in create_connection
for res in getaddrinfo(host, port, 0, SOCK_STREAM):
File "/usr/lib/python3.5/socket.py", line 733, in getaddrinfo
for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno -5] No address associated with hostname
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "pagescanner.py", line 39, in parseAddress
website = urllib2.urlopen(getAddress(url))
File "/usr/lib/python3.5/urllib/request.py", line 163, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.5/urllib/request.py", line 466, in open
response = self._open(req, data)
File "/usr/lib/python3.5/urllib/request.py", line 484, in _open
'_open', req)
File "/usr/lib/python3.5/urllib/request.py", line 444, in _call_chain
result = func(*args)
File "/usr/lib/python3.5/urllib/request.py", line 1282, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "/usr/lib/python3.5/urllib/request.py", line 1256, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [Errno -5] No address associated with hostname>
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "pagescanner.py", line 85, in <module>
main()
File "pagescanner.py", line 71, in main
execute()
File "pagescanner.py", line 60, in execute
parseAddress(s)
File "pagescanner.py", line 51, in parseAddress
print ("Cannot retrive URL: ") + err.reason[1]
TypeError: 'gaierror' object does not support indexing
and here's the code I'm using
import sys
try:
import urllib.request as urllib2
except ImportError:
import urllib2
import re
import csv
list1 = []
list2 = []
list3 = []
def addList():
with open('file.csv', 'rt') as f:
reader = csv.reader(f)
for row in reader:
for s in row:
list2.append(s)
def getAddress(url):
http = "http://"
https = "https://"
if http in url:
return url
elif https in url:
return url
else:
url = "http://" + url
return url
def parseAddress(url):
global list3
try:
website = urllib2.urlopen(getAddress(url))
html = website.read()
addys = re.findall('''[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*#(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?''', html, flags=re.IGNORECASE)
global list1
list1.append(addys)
except urllib2.HTTPError as err:
print ("Cannot retrieve URL: HTTP Error Code: "), err.code
list3.append(url)
except urllib2.URLError as err:
print ("Cannot retrive URL: ") + err.reason[1]
list3.append(url)
def execute():
global list2
addList()
totalNum = len(list2)
atNum = 1
for s in list2:
parseAddress(s)
print ("Processing ") + str(atNum) + (" out of ") + str(totalNum)
atNum = atNum + 1
print ("Completed. Emails parsed: ") + str(len(list1)) + "."
### MAIN
def main():
global list2
execute()
global list1
myFile = open("finishedFile.csv", "w+")
wr = csv.writer(myFile, quoting=csv.QUOTE_ALL)
for s in list1:
wr.writerow(s)
myFile.close
global list3
failFile = open("failedSites.csv", "w+")
write = csv.writer(failFile, quoting=csv.QUOTE_ALL)
for j in list3:
write.writerow(j)
failFile.close
main()
I'm assuming it has something to do with me trying to translate thr code from python2 to python3, but I'm stumped.

If it is a URL error, it may be that you are trying to parse an invalid address from your Excel spreadsheet, so showing those URLs would be helpful. If it is a socket error, you can try resolving this issue by using the socket class to parse the URL and get the host name:
import socket
ipAdd = socket.gethostbyname(url) # Retrieves URL's IP address (example: 'www.google.com'
website = urllib2.urlopen(getAddress(ipAdd)) # Formats to http:// or https://
html = website.read() # Gets HTML code
If that doesn't work, you may need to install Flask, a Web application framework designed to act as a socket (to communicate between servers and applications). It is supported in Python 3.5 and newer, Python 2.7, and PyPi:
$ pip install Flask

Scraping in python to using open api

i want to scraping "www.naver.com"
so i tried to scraping using open api
i wrote code following this:
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
defaultURL = 'http://openapi.naver.com/search?&'
key = 'key=keyvalue'
target='&target=news'
sort='&sort=sim'
start='&start=1'
display='&display=100'
query='&query='+urllib.parse.quote_plus(str(input("write:")))
fullURL=defaultURL+key+target+sort+start+display+query
print(fullURL)
file=open("C:\\Users\\kimty\\Desktop\\k\\python\\N\\naver_news.txt","w",encoding='utf-8')
f=urllib.request.urlopen(fullURL)
resultXML=f.read()
xmlsoup=BeautifulSoup(resultXML,'html.parser')
items=xmlsoup.find._all('item')
for item in items:
file.write('---------------------------------------\n')
file.write('title :'+item.tile.get_text(strip=True)+'\n')
file.write('contents : '+item.description.get_text(strip=True)+'\n')
file.write('\n')
file.close()
but python shell only show this
============= RESTART: C:\Users\kimty\Desktop\kpython\N\N.py =============
write:lee
http://openapi.naver.com/search?&key=keyvalue&target=news&sort=sim&start=1&display=100&query=lee
Traceback (most recent call last):
File "C:\Users\kimty\Desktop\k\python\N\N.py", line 19, in <module>
f=urllib.request.urlopen(fullURL)
File "C:\Python34\lib\urllib\request.py", line 161, in urlopen
return opener.open(url, data, timeout)
File "C:\Python34\lib\urllib\request.py", line 464, in open
response = self._open(req, data)
File "C:\Python34\lib\urllib\request.py", line 482, in _open
'_open', req)
File "C:\Python34\lib\urllib\request.py", line 442, in _call_chain
result = func(*args)
File "C:\Python34\lib\urllib\request.py", line 1211, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "C:\Python34\lib\urllib\request.py", line 1186, in do_open
r = h.getresponse()
File "C:\Python34\lib\http\client.py", line 1227, in getresponse
response.begin()
File "C:\Python34\lib\http\client.py", line 386, in begin
version, status, reason = self._read_status()
File "C:\Python34\lib\http\client.py", line 356, in _read_status
raise BadStatusLine(line)
http.client.BadStatusLine: ''
why this happening?
what about that python shell talk to me?
i am using windows 8.1 64x, python 3.4.4

This http.client.BadStatusLine is a subclass of http.client.HTTPException. It gave you a http error back, maybe your API key is wrong! If I try to access the link with my browser it also gives me an error.
This is the exact address you tried to request.
Edit
Some people have fixed this error by importing the http lib.

Socket error when using gdata Youtube API in Python

I'm using gdata to map YouTube URLs to video titles, using the following code:
import gdata.youtube.service as youtube
import re
import queue
import urlparse
ytservice = youtube.YouTubeService()
ytservice.ssl = True
ytservice.developer_key = '' # snip
class youtube(mediaplugin):
def __init__(self, parsed_url):
self.url = parsed_url
self.video_id = urlparse.parse_qs(parsed_url.query)['v'][0]
self.ytdata = ytservice.GetYouTubeVideoEntry(self.video_id)
print self.ytdata
I get the following socket exception when calling service.GetYouTubeVideoEntry():
File "/Users/haldean/Documents/qpi/qpi/media.py", line 21, in __init__
self.ytdata = ytservice.GetYouTubeVideoEntry(self.video_id)
File "/Users/haldean/Documents/qpi/lib/python2.7/site-packages/gdata/youtube/service.py", line 210, in GetYouTubeVideoEntry
return self.Get(uri, converter=gdata.youtube.YouTubeVideoEntryFromString)
File "/Users/haldean/Documents/qpi/lib/python2.7/site-packages/gdata/service.py", line 1069, in Get
headers=extra_headers)
File "/Users/haldean/Documents/qpi/lib/python2.7/site-packages/atom/__init__.py", line 93, in optional_warn_function
return f(*args, **kwargs)
File "/Users/haldean/Documents/qpi/lib/python2.7/site-packages/atom/service.py", line 186, in request
data=data, headers=all_headers)
File "/Users/haldean/Documents/qpi/lib/python2.7/site-packages/atom/http_interface.py", line 148, in perform_request
return http_client.request(operation, url, data=data, headers=headers)
File "/Users/haldean/Documents/qpi/lib/python2.7/site-packages/atom/http.py", line 163, in request
connection.endheaders()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 937, in endheaders
self._send_output(message_body)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 797, in _send_output
self.send(msg)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 759, in send
self.connect()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 1140, in connect
self.timeout, self.source_address)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/socket.py", line 553, in create_connection
for res in getaddrinfo(host, port, 0, SOCK_STREAM):
gaierror: [Errno 8] nodename nor servname provided, or not known
I'm at a loss as to how to even begin debugging this. Any ideas appreciated. Thanks!
Edit:
In response to a question asked in comments, video_id is qh-mwjF-OMo and parsed_url is:
ParseResult(scheme=u'http', netloc=u'www.youtube.com', path=u'/watch', params='', query=u'v=qh-mwjF-OMo&feature=g-user-u', fragment='')

My mistake was that the video_id should be passed as a keyword parameter, like so:
self.ytdata = ytservice.GetYouTubeVideoEntry(video_id=self.video_id)
It seems that the socket exception is the only layer of gdata that will throw an exception; it tries to get a URL blindly based on the arguments and it only fails when the URL fetch fails.

Problem passing XML to HTTP Post api using urllib in Python

I am trying to access a REST api and need to call it with a line of XML for a filter condition. My apologies for providing code that others cannot access. When I execute this code, I get the error message listed below.
import urllib2
import urllib
import hashlib
import hmac
import time
import random
import base64
def MakeRequest():
url = 'https://api01.marketingstudio.com/API/Gateway/9'
publickey = ''
privatekey = ''
method = 'Query'
nonce = random.randrange(123400, 9999999)
age = int(time.time())
final = str(age) + '&' + str(nonce) + '&' + method.lower() + '&' + url.lower()
converted = hmac.new(privatekey, final, hashlib.sha1).digest()
authorization = 'AMS ' + publickey + ':' + base64.b64encode(converted)
xml_string = "<list><FilterItems><FilterItem attribute='pageNumber' value='1'/></FilterItems></list>"
form = {'XML':xml_string}
data = urllib.urlencode(form)
headers = {'Content-Type': 'application/xml'}
req = urllib2.Request(url,data,headers)
req.add_header('ams-method', method)
req.add_header('ams-nonce', nonce)
req.add_header('ams-age', age)
req.add_header('Authorization', authorization)
r = urllib2.urlopen(req)
print r.read()
MakeRequest();
Here is the error message.
Data at the root level is invalid. Line 1, position 1.
at Aprimo.REST.Core.RESTService.GetRequest(String URI, HttpRequest req)
at Aprimo.REST.RESTHandler.GetRequest(String apiUrl, HttpContext context)
at Aprimo.REST.RESTHandler.ProcessRequest(HttpContext context)
I think this has the correct logic and filter conditions, what should I look at to get this to work. Thanks.
Per #Mark's suggestion I removed the urlencode for the XML string and got the following TraceBack:
Traceback (most recent call last):
File "file.py", line 36, in <module>
MakeRequest();
File "file.py", line 32, in MakeRequest
r = urllib2.urlopen(req)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 126, in urlopen
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 392, in open
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 410, in _open
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 370, in _call_chain
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 1194, in https_open
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 1155, in do_open
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 941, in request
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 975, in _send_request
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 937, in endheaders
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 801, in _send_output
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 773, in send
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/ssl.py", line 207, in sendall
TypeError: unhashable type

So the problem was with the formatting of the form variable and the encoding I was trying to do. Revising the following lines gets the call to work. I did not need to specify the headers.
xml_string = "<list><FilterItems><FilterItem attribute='pageNumber' value='1'/></FilterItems></list>"
data = (xml_string)
req = urllib2.Request(url,data)

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Send XML using urllib - python

Sending a XML file through a GET request is bare nonsense. Use POST instead.

Related

"Cannot use string pattern on bytes like object" when trying to parse htmls for emails

getting gai & socket errors and I'm not sure what either of these mean

Scraping in python to using open api

Socket error when using gdata Youtube API in Python

Problem passing XML to HTTP Post api using urllib in Python

Categories

Resources