How to parse XML in Python - retrieve elements within tree/root - python
I am currently writing a Python script for some web application testing and am running into some issues. I’d classify myself as a beginner in Python so explanations behind possible solutions would be much appreciated.
The goal of this script is to take in an xml file, which contains website request and response details, and parse it out to give me easy to ingest data that I can run validation checks on.
My sample XML file looks like this:
<?xml version="1.1"?>
<items exportTime="Mon Mar 14 14:28:18 EDT 2022">
<item>
<time>Mon Mar 14 13:59:37 EDT 2022</time>
<url><![CDATA[https://www.youtube.com/sw.js_data]]></url>
<protocol>https</protocol>
<method><![CDATA[GET]]></method>
<path><![CDATA[/sw.js_data]]></path>
<extension>null</extension>
<request base64="false"><![CDATA[GET /sw.js_data HTTP/2
Host: www.youtube.com
Accept: */*
Sec-Fetch-Site: same-origin
Sec-Fetch-Mode: cors
Sec-Fetch-Dest: empty
Referer: https://www.youtube.com/sw.js
]]></request>
<status>200</status>
<mimetype>JSON</mimetype>
<response base64="false"><![CDATA[HTTP/2 200 OK
Content-Type: application/json; charset=utf-8
X-Content-Type-Options: nosniff
Cache-Control: no-cache, no-store, max-age=0, must-revalidate
Pragma: no-cache
Expires: Mon, 01 Jan 1990 00:00:00 GMT
Date: Mon, 14 Mar 2022 17:59:34 GMT
Content-Disposition: attachment; filename="response.bin"; filename*=UTF-8''response.bin
Strict-Transport-Security: max-age=31536000
X-Frame-Options: SAMEORIGIN
Cross-Origin-Opener-Policy-Report-Only: same-origin; report-to="ATmXEA_XZXH6CdbrmjUzyTbVgxu22C8KYH7NsxKbRt94"
Permissions-Policy: ch-ua-arch=*, ch-ua-bitness=*, ch-ua-full-version=*, ch-ua-full-version-list=*, ch-ua-model=*, ch-ua-platform=*, ch-ua-platform-version=*
Accept-Ch: Sec-CH-UA-Arch, Sec-CH-UA-Bitness, Sec-CH-UA-Full-Version, Sec-CH-UA-Full-Version-List, Sec-CH-UA-Model, Sec-CH-UA-Platform, Sec-CH-UA-Platform-Version
Server: ESF
X-Xss-Protection: 0
Alt-Svc: h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"
)]}'
[["yt.sw.adr",null,[[["en","US","US","75.188.116.252",null,null,1,null,[],null,null,"","","CgtaVS1FWnl4ZTJEZyiGhb6RBg%3D%3D","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36,gzip(gfe)",1,"2.20220311.01.00","Windows","10.0",null,null,null,"https://www.youtube.com/sw.js_data",[],null,null,null,null,null,null,null,[],null,null,null,null,null,null,null,null,2,1,"vip\u003d142.250.190.14,server_port\u003d443,client_port\u003d57491,tcp_connection_request_count\u003d0,header_order\u003dHCUAREL,gfe_version\u003d2.764.3,ssl,ssl_info\u003dTLSv1.3:RNA:T,tlsext\u003dS,sni\u003dwww.youtube.com,hex_encoded_client_hello\u003d130213011303c02cc02bcca9c030cca8c02f009fccaa00a3009e00a2c024c028c023c027006b006a00670040c02ec032c02dc031c026c02ac025c029c00ac014c009c0130039003800330032c005c00fc004c00e009d009c003d003c0035002fc008c01200160013c003c00d000a00ff-00-00000005000a000b000d00320010001100170023002b002d0033,c\u003d1302,pn\u003dalpn,ja3\u003d2d5bd942ebf308df61e1572861d146f6,rtt_source\u003dh2_ping,rtt\u003d41,srtt\u003d80,client_protocol\u003dh2,client_transport\u003dtcp,gfe\u003dacorde13.prod.google.com,pzf\u003dWindows NT kernel [4:118+10:0:1460:mss*44/8:mss/nop/ws/nop/nop/sok:df/id+:0] [generic 
tos:0x20],vip_region\u003ddefault,asn\u003d10796,cc\u003dUS,eid\u003dhoIvYvCZG42E8wes0aCgCw,scheme\u003dhttps,location\u003dord37s32,metro\u003dORD",null,null,0,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,[null,null,null,null,null,"CIaFvpEGEP_3rQUQmOqtBRC3y60FEPX4rQUQ1vitBRDI0_0SENi-rQUQkfj8Eg%3D%3D"],null,null,2,[],null,null,null,null,null,null,null,null,null,null,null,null,null,"America/New_York",null,null,["US",2],null,null,[],"Chrome","99.0.4844.51",null,null,null,null,null,null,null,null,null,null,null,[],[]],null,[null,null,null,null,null,null,null,null,null,null,null,[],null,null,null,0],null,[null,null,null,null,null,null,1,null,null,null,null,null,null,null,[],null,[1647280774452302,176365571,201449727],null,null,null,null,null,null,null,null,[],null,[]],[null,"IhMIzqigi5fG9gIVAyCDCh3/4AEM"],null,null,null,null,null,[]],"AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8",null,null,"","QUFFLUhqbnREclEzblJmc25GVF9XSXQ1dFZQSm9sRGlmQXxBQ3Jtc0tuU3huS1RoOHQyaFlqN0dLdm4wcGMweXp0OURWQU5RbEJKRko1TlhGYjBoZ3N1Nnpla3QxUFRkN19uaWxoQVZTV0FRUGh0cUw2ckRWbmh5bGhxYkRjNFc2cUREbjB4MnFxMEpval9HUXNZeWU5d1Ztaw\u003d\u003d","CgtaVS1FWnl4ZTJEZyiGhb6RBg%3D%3D"],"Vf114d778||"]]]]></response>
<comment></comment>
</item>
<item>
<time>Mon Mar 14 13:59:14 EDT 2022</time>
<url><![CDATA[https://www.google.com/client_204?&atyp=i&biw=1440&bih=849&dpr=1.5&ei=Z4IvYpTtF5LU9AP1nIOICQ]]></url>
<protocol>https</protocol>
<method><![CDATA[GET]]></method>
<path><![CDATA[/client_204?&atyp=i&biw=1440&bih=849&dpr=1.5&ei=Z4IvYpTtF5LU9AP1nIOICQ]]></path>
<extension>null</extension>
<request base64="false"><![CDATA[GET /client_204?&atyp=i&biw=1440&bih=849&dpr=1.5&ei=Z4IvYpTtF5LU9AP1nIOICQ HTTP/2
Host: www.google.com
Sec-Ch-Ua-Arch: "x86"
Sec-Ch-Ua-Full-Version: "99.0.4844.51"
Sec-Ch-Ua-Platform-Version: "10.0.0"
Sec-Ch-Ua-Bitness: "64"
Sec-Ch-Ua-Model:
Sec-Ch-Ua-Platform: "Windows"
Accept: image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8
X-Client-Data: CJDnygE=
Sec-Fetch-Site: same-origin
Sec-Fetch-Mode: no-cors
Sec-Fetch-Dest: image
Referer: https://www.google.com/
Accept-Encoding: gzip, deflate
Accept-Language: en-US,en;q=0.9
]]></request>
<status>204</status>
<mimetype></mimetype>
<response base64="false"><![CDATA[HTTP/2 204 No Content
Content-Type: text/html; charset=UTF-8
Strict-Transport-Security: max-age=31536000
Content-Security-Policy: object-src 'none';base-uri 'self';script-src 'nonce-9KQUw4dRjvKnx/zTrOblTQ==' 'strict-dynamic' 'report-sample' 'unsafe-eval' 'unsafe-inline' https: http:;report-uri https://csp.withgoogle.com/csp/gws/cdt1
Bfcache-Opt-In: unload
Date: Mon, 14 Mar 2022 17:59:10 GMT
Server: gws
Content-Length: 0
X-Xss-Protection: 0
X-Frame-Options: SAMEORIGIN
Alt-Svc: h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"
]]></response>
<comment></comment>
</item>
</items>
The XML seems a little strangely formatted around the headers but I would like to just extract the request host and the response headers from both items (google and youtube) and put them into a list. I have made some progress but I'm having trouble running a for loop to get just these values for both items. I'm not sure if I'm able to call the tag to get this data or what the best way to go about this is.
My current code is below:
# Parse the exported XML file and take the root element of the tree.
tree = ET.parse('googleandyoutube.xml')
root = tree.getroot()
# Serialize the whole tree back into a UTF-8 string; the loop below does not use it.
new = ET.tostring(root, encoding='utf8').decode('utf8')
#below loop kind of works in getting about the right information but it does not do it for both items
# NOTE(review): `response` is never assigned anywhere in this snippet, so the loop
# body reads nothing from the current `item`; presumably it was set by earlier code
# not shown here, which would explain the same text being printed on every iteration.
for item in root.iter('item'):
print(response.text)
result:
HTTP/2 204 No Content
Content-Type: text/html; charset=UTF-8
Strict-Transport-Security: max-age=31536000
Content-Security-Policy: object-src 'none';base-uri 'self';script-src 'nonce-9KQUw4dRjvKnx/zTrOblTQ==' 'strict-dynamic' 'report-sample' 'unsafe-eval' 'unsafe-inline' https: http:;report-uri https://csp.withgoogle.com/csp/gws/cdt1
X-Xss-Protection: 0
X-Frame-Options: SAMEORIGIN
HTTP/2 204 No Content
Content-Type: text/html; charset=UTF-8
Strict-Transport-Security: max-age=31536000
Content-Security-Policy: object-src 'none';base-uri 'self';script-src 'nonce-9KQUw4dRjvKnx/zTrOblTQ==' 'strict-dynamic' 'report-sample' 'unsafe-eval' 'unsafe-inline' https: http:;report-uri https://csp.withgoogle.com/csp/gws/cdt1
X-Xss-Protection: 0
X-Frame-Options: SAMEORIGIN
Any help is greatly appreciated!
Edit: extra code at end of first website response below
X-Xss-Protection: 0
Alt-Svc: h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"
)]}'
[["yt.sw.adr",null,[[["en","US","US","75.188.116.252",null,null,1,null,[],null,null,"","","CgtaVS1FWnl4ZTJEZyiGhb6RBg%3D%3D","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36,gzip(gfe)",1,"2.20220311.01.00","Windows","10.0",null,null,null,"https://www.youtube.com/sw.js_data",[],null,null,null,null,null,null,null,[],null,null,null,null,null,null,null,null,2,1,"vip\u003d142.250.190.14,server_port\u003d443,client_port\u003d57491,tcp_connection_request_count\u003d0,header_order\u003dHCUAREL,gfe_version\u003d2.764.3,ssl,ssl_info\u003dTLSv1.3:RNA:T,tlsext\u003dS,sni\u003dwww.youtube.com,hex_encoded_client_hello\u003d130213011303c02cc02bcca9c030cca8c02f009fccaa00a3009e00a2c024c028c023c027006b006a00670040c02ec032c02dc031c026c02ac025c029c00ac014c009c0130039003800330032c005c00fc004c00e009d009c003d003c0035002fc008c01200160013c003c00d000a00ff-00-00000005000a000b000d00320010001100170023002b002d0033,c\u003d1302,pn\u003dalpn,ja3\u003d2d5bd942ebf308df61e1572861d146f6,rtt_source\u003dh2_ping,rtt\u003d41,srtt\u003d80,client_protocol\u003dh2,client_transport\u003dtcp,gfe\u003dacorde13.prod.google.com,pzf\u003dWindows NT kernel [4:118+10:0:1460:mss*44/8:mss/nop/ws/nop/nop/sok:df/id+:0] [generic 
tos:0x20],vip_region\u003ddefault,asn\u003d10796,cc\u003dUS,eid\u003dhoIvYvCZG42E8wes0aCgCw,scheme\u003dhttps,location\u003dord37s32,metro\u003dORD",null,null,0,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,[null,null,null,null,null,"CIaFvpEGEP_3rQUQmOqtBRC3y60FEPX4rQUQ1vitBRDI0_0SENi-rQUQkfj8Eg%3D%3D"],null,null,2,[],null,null,null,null,null,null,null,null,null,null,null,null,null,"America/New_York",null,null,["US",2],null,null,[],"Chrome","99.0.4844.51",null,null,null,null,null,null,null,null,null,null,null,[],[]],null,[null,null,null,null,null,null,null,null,null,null,null,[],null,null,null,0],null,[null,null,null,null,null,null,1,null,null,null,null,null,null,null,[],null,[1647280774452302,176365571,201449727],null,null,null,null,null,null,null,null,[],null,[]],[null,"IhMIzqigi5fG9gIVAyCDCh3/4AEM"],null,null,null,null,null,[]],"AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8",null,null,"","QUFFLUhqbnREclEzblJmc25GVF9XSXQ1dFZQSm9sRGlmQXxBQ3Jtc0tuU3huS1RoOHQyaFlqN0dLdm4wcGMweXp0OURWQU5RbEJKRko1TlhGYjBoZ3N1Nnpla3QxUFRkN19uaWxoQVZTV0FRUGh0cUw2ckRWbmh5bGhxYkRjNFc2cUREbjB4MnFxMEpval9HUXNZeWU5d1Ztaw\u003d\u003d","CgtaVS1FWnl4ZTJEZyiGhb6RBg%3D%3D"],"Vf114d778||"]]
** end of first response **
Edit: pasted original XML
First off, when asking questions, it is extremely helpful if the input data (XML) is well-formed; otherwise it is cumbersome for others to get it working.
In your case, if I understand correctly, you need two kinds of functions. list(xml_element) returns all children of the element: for a list-like element such as <items>, that is every <item>; for a record-like element such as <item>, that is each of its fields (<time>, <url>, <response>, ...).
With element.find("name_of_subitem") you can access a specific element by name.
See working example below with a reduced version of your code.
# Reduced sample of the exported XML: an <items> root holding two <item> records,
# each with a <response> element whose text is the raw HTTP response headers.
s = """
<items>
<item>
<time>Mon Mar 14 13:59:37 EDT 2022</time>
<url><![CDATA[https://www.youtube.com]]></url>
<request base64="false">
</request>
<status>200</status>
<response base64="false">
<![CDATA[HTTP/2 204 No Content
Content-Type: text/html; charset=UTF-8
Strict-Transport-Security: max-age=31536000
Content-Security-Policy: object-src 'none';base-uri 'self';script-src 'nonce-9KQUw4dRjvKnx/zTrOblTQ==' 'strict-dynamic' 'report-sample' 'unsafe-eval' 'unsafe-inline' https: http:;report-uri https://csp.withgoogle.com/csp/gws/cdt1
X-Xss-Protection: 0
X-Frame-Options: SAMEORIGIN
]]>
</response>
</item>
<item>
<time>Mon Mar 14 13:59:14 EDT 2022</time>
<url>
</url>
<request base64="false"></request>
<status>204</status>
<response base64="false">
<![CDATA[HTTP/2 204 No Content
Content-Type: text/html; charset=UTF-8
Strict-Transport-Security: max-age=31536000
Content-Security-Policy: object-src 'none';base-uri 'self';script-src 'nonce-9KQUw4dRjvKnx/zTrOblTQ==' 'strict-dynamic' 'report-sample' 'unsafe-eval' 'unsafe-inline' https: http:;report-uri https://csp.withgoogle.com/csp/gws/cdt1
X-Xss-Protection: 0
X-Frame-Options: SAMEORIGIN
]]>
</response>
</item>
</items>
"""
import xml.etree.ElementTree as ET

# fromstring() returns the root element (<items>) directly.
items = ET.fromstring(s)

# list(items) yields the direct children of the root, i.e. one entry per <item>.
for item in list(items):
    # Look the <response> child up on the *current* item so that every item
    # contributes its own response text.
    resp = item.find("response")
    # strip() drops the newlines that surround the CDATA section in the sample.
    print(resp.text.strip())
Result
HTTP/2 204 No Content
Content-Type: text/html; charset=UTF-8
Strict-Transport-Security: max-age=31536000
Content-Security-Policy: object-src 'none';base-uri 'self';script-src 'nonce-9KQUw4dRjvKnx/zTrOblTQ==' 'strict-dynamic' 'report-sample' 'unsafe-eval' 'unsafe-inline' https: http:;report-uri https://csp.withgoogle.com/csp/gws/cdt1
X-Xss-Protection: 0
X-Frame-Options: SAMEORIGIN
HTTP/2 204 No Content
Content-Type: text/html; charset=UTF-8
Strict-Transport-Security: max-age=31536000
Content-Security-Policy: object-src 'none';base-uri 'self';script-src 'nonce-9KQUw4dRjvKnx/zTrOblTQ==' 'strict-dynamic' 'report-sample' 'unsafe-eval' 'unsafe-inline' https: http:;report-uri https://csp.withgoogle.com/csp/gws/cdt1
X-Xss-Protection: 0
X-Frame-Options: SAMEORIGIN
Related
Python - How to save output from loop to multiple callable variables
I have the following Python code where items is a string of joined XML data produced from two website requests/responses: items = ET.fromstring(new) for item in list(items): url = item.find("url") endpoint = url.text ## resp = item.find("response") response = resp.text responses = response.split("\n") index = responses.index('') indexed = responses[:index] print(endpoint, *indexed, sep = "\n") which prints: https://www.youtube.com/sw.js_data HTTP/2 200 OK Content-Type: application/json; charset=utf-8 X-Content-Type-Options: nosniff Cache-Control: no-cache, no-store, max-age=0, must-revalidate Pragma: no-cache Expires: Mon, 01 Jan 1990 00:00:00 GMT Date: Mon, 14 Mar 2022 17:59:34 GMT Content-Disposition: attachment; filename="response.bin"; filename*=UTF-8''response.bin Strict-Transport-Security: max-age=31536000 X-Frame-Options: SAMEORIGIN Cross-Origin-Opener-Policy-Report-Only: same-origin; report-to="ATmXEA_XZXH6CdbrmjUzyTbVgxu22C8KYH7NsxKbRt94" Permissions-Policy: ch-ua-arch=*, ch-ua-bitness=*, ch-ua-full-version=*, ch-ua-full-version-list=*, ch-ua-model=*, ch-ua-platform=*, ch-ua-platform-version=* Accept-Ch: Sec-CH-UA-Arch, Sec-CH-UA-Bitness, Sec-CH-UA-Full-Version, Sec-CH-UA-Full-Version-List, Sec-CH-UA-Model, Sec-CH-UA-Platform, Sec-CH-UA-Platform-Version Server: ESF X-Xss-Protection: 0 Alt-Svc: h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43" https://www.google.com/client_204?&atyp=i&biw=1440&bih=849&dpr=1.5&ei=Z4IvYpTtF5LU9AP1nIOICQ HTTP/2 204 No Content Content-Type: text/html; charset=UTF-8 Strict-Transport-Security: max-age=31536000 Content-Security-Policy: object-src 'none';base-uri 'self';script-src 'nonce-9KQUw4dRjvKnx/zTrOblTQ==' 'strict-dynamic' 'report-sample' 'unsafe-eval' 'unsafe-inline' https: http:;report-uri https://csp.withgoogle.com/csp/gws/cdt1 Bfcache-Opt-In: unload Date: Mon, 14 Mar 2022 17:59:10 GMT Server: gws 
Content-Length: 0 X-Xss-Protection: 0 X-Frame-Options: SAMEORIGIN Set-Cookie: 1P_JAR=2022-03-14-17; expires=Wed, 13-Apr-2022 17:59:10 GMT; path=/; domain=.google.com; Secure; SameSite=none Alt-Svc: h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43" Basically, I would like to be able to individually evaluate the data that is produced from the above code to where I could check to ensure header values are in each response from the websites. So in this example, the code would check the set of headers produced from the first website first (youtube) and say, all headers look good. Then check the set of headers produced from the second website (google) and say, missing Strict-Transport-Security header (for example). The goal of this code is that it would be able to run validate through these website responses no matter how many are loaded into the initial string and tell me if any headers are missing. Is there an easy way to do this? I would think at some point each output (list of headers) from each website would be saved to variables that can be referenced/called? Maybe this is getting messy and will not be easy to do - not sure! Also happy to take any advice on making this code a little bit cleaner if there's a more efficient way to do what I am trying to do. Thank you! 
Full XML string below: <?xml version='1.0' encoding='utf8'?> <items burpVersion="2022.2.3" exportTime="Mon Mar 14 14:28:18 EDT 2022"> <item> <time>Mon Mar 14 13:59:37 EDT 2022</time> <url>https://www.youtube.com/sw.js_data</url> <host ip="142.250.190.142">www.youtube.com</host> <port>443</port> <protocol>https</protocol> <method>GET</method> <path>/sw.js_data</path> <extension>null</extension> <request base64="false">GET /sw.js_data HTTP/2 Host: www.youtube.com Accept: */* Sec-Fetch-Site: same-origin Sec-Fetch-Mode: cors Sec-Fetch-Dest: empty Referer: https://www.youtube.com/sw.js Accept-Encoding: gzip, deflate Accept-Language: en-US,en;q=0.9 </request> <status>200</status> <responselength>3524</responselength> <mimetype>JSON</mimetype> <response base64="false">HTTP/2 200 OK Content-Type: application/json; charset=utf-8 X-Content-Type-Options: nosniff Cache-Control: no-cache, no-store, max-age=0, must-revalidate Pragma: no-cache Expires: Mon, 01 Jan 1990 00:00:00 GMT Date: Mon, 14 Mar 2022 17:59:34 GMT Content-Disposition: attachment; filename="response.bin"; filename*=UTF-8''response.bin Strict-Transport-Security: max-age=31536000 X-Frame-Options: SAMEORIGIN Cross-Origin-Opener-Policy-Report-Only: same-origin; report-to="ATmXEA_XZXH6CdbrmjUzyTbVgxu22C8KYH7NsxKbRt94" Permissions-Policy: ch-ua-arch=*, ch-ua-bitness=*, ch-ua-full-version=*, ch-ua-full-version-list=*, ch-ua-model=*, ch-ua-platform=*, ch-ua-platform-version=* Accept-Ch: Sec-CH-UA-Arch, Sec-CH-UA-Bitness, Sec-CH-UA-Full-Version, Sec-CH-UA-Full-Version-List, Sec-CH-UA-Model, Sec-CH-UA-Platform, Sec-CH-UA-Platform-Version Server: ESF X-Xss-Protection: 0 Alt-Svc: h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43" )]}' 
[["yt.sw.adr",null,[[["en","US","US","75.188.116.252",null,null,1,null,[],null,null,"","",null,null,"","QUFFLUhqbnREclEzblJmc25GVF9XSXQ1dFZQSm9sRGlmQXxBQ3Jtc0tuU3huS1RoOHQyaFlqN0dLdm4wcGMweXp0OURWQU5RbEJKRko1TlhGYjBoZ3N1Nnpla3QxUFRkN19uaWxoQVZTV0FRUGh0cUw2ckRWbmh5bGhxYkRjNFc2cUREbjB4MnFxMEpval9HUXNZeWU5d1Ztaw\u003d\u003d","CgtaVS1FWnl4ZTJEZyiGhb6RBg%3D%3D"],"Vf114d778||"]]</response> <comment /> </item> <item> <time>Mon Mar 14 13:59:14 EDT 2022</time> <url>https://www.google.com/client_204?&atyp=i&biw=1440&bih=849&dpr=1.5&ei=Z4IvYpTtF5LU9AP1nIOICQ</url> <host ip="172.217.4.36">www.google.com</host> <port>443</port> <protocol>https</protocol> <method>GET</method> <path>/client_204?&atyp=i&biw=1440&bih=849&dpr=1.5&ei=Z4IvYpTtF5LU9AP1nIOICQ</path> <extension>null</extension> <request base64="false">GET /client_204?&atyp=i&biw=1440&bih=849&dpr=1.5&ei=Z4IvYpTtF5LU9AP1nIOICQ HTTP/2 Host: www.google.com Sec-Ch-Ua: "(Not(A:Brand";v="8", "Chromium";v="99" Sec-Ch-Ua-Mobile: ?0 User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36 Sec-Ch-Ua-Arch: "x86" Sec-Ch-Ua-Full-Version: "99.0.4844.51" Sec-Ch-Ua-Platform-Version: "10.0.0" Sec-Ch-Ua-Bitness: "64" Sec-Ch-Ua-Model: Sec-Ch-Ua-Platform: "Windows" Accept: image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8 X-Client-Data: CJDnygE= Sec-Fetch-Site: same-origin Sec-Fetch-Mode: no-cors Sec-Fetch-Dest: image Referer: https://www.google.com/ Accept-Encoding: gzip, deflate Accept-Language: en-US,en;q=0.9 </request> <status>204</status> <responselength>781</responselength> <mimetype /> <response base64="false">HTTP/2 204 No Content Content-Type: text/html; charset=UTF-8 Strict-Transport-Security: max-age=31536000 Content-Security-Policy: object-src 'none';base-uri 'self';script-src 'nonce-9KQUw4dRjvKnx/zTrOblTQ==' 'strict-dynamic' 'report-sample' 'unsafe-eval' 'unsafe-inline' https: http:;report-uri https://csp.withgoogle.com/csp/gws/cdt1 
Bfcache-Opt-In: unload Date: Mon, 14 Mar 2022 17:59:10 GMT Server: gws Content-Length: 0 X-Xss-Protection: 0 X-Frame-Options: SAMEORIGIN Set-Cookie: 1P_JAR=2022-03-14-17; expires=Wed, 13-Apr-2022 17:59:10 GMT; path=/; domain=.google.com; Secure; SameSite=none Alt-Svc: h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43" </response> <comment /> </item> </items> Update: have continued messing with the code for the past couple days with still no luck. Any and all thoughts welcome!
# Simply save output to a single dictionary variable of many items. Because your
# text split requires multiple steps, consider a defined method.

# DEFINED METHOD TO SPLIT RESPONSE BY LINE BREAKS
def split_text(resp):
    """Return the lines of ``resp`` that precede its first empty line.

    ``resp`` is split on "\n"; the first empty string in the result marks the
    end of the header section. If there is no empty line at all, every line is
    returned instead of raising ``ValueError``.
    """
    responses = resp.split("\n")
    try:
        index = responses.index('')
    except ValueError:
        # No blank separator line: treat the whole text as headers.
        index = len(responses)
    return responses[:index]

# PARSE XML FILE
doc = ET.fromstring(new)

# RETRIEVE ITEM NODES WITH DICTIONARY COMPREHENSION
website_items = {
    item.find("url").text: split_text(item.find("response").text)
    for item in doc.findall(".//item")
}

# REVIEW SAVED DATA WITH URLS AS KEYS
website_items["https://www.youtube.com/sw.js_data"]
website_items["https://www.google.com/client_204?&atyp=i&biw=1440&bih=849&dpr=1.5&ei=Z4IvYpTtF5LU9AP1nIOICQ"]
Need help scraping tracking number details from UPS website with Python without API
I want to build a simple python program that gets updates for tracking numbers from UPS, I couldn't get an account number with them so I can't use their API. I decided to try web scraping. Here's an example of a tracking number: https://www.ups.com/track?loc=en_US&tracknum=1Z0X118AYW08592000&requester=WT/trackdetails I want to get the scheduled delivery date, the problem is that what the requests module scrapes and what shows when I view the page source doesn't get all the information inside a tag called app-root. That's where the delivery date is. I found a similar post that solves this problem with FedEx, but I can't get it to work with the ups website: Parsing HTML does not output desired data(tracking info for FedEx) I installed an extension called HTTP Trace that shows all the requests that go through my server, I can't find the one that matches UPS, this is what I got from the extension when I searched for the tracking number, any ideas what I can do here? https://wwwapps.ups.com/WebTracking/track?loc=en_IL HTMLVersion: 5.0 loc: en_IL track.x: Track trackNums: 1Z0X118AYW08592000 ups-search: 1Z0X118AYW08592000 POST https://wwwapps.ups.com/WebTracking/track?loc=en_IL Upgrade-Insecure-Requests: 1 Content-Type: application/x-www-form-urlencoded User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9 HTTP/1.1 302 Moved Temporarily Redirect to: https://www.ups.com/track?loc=en_IL&tracknum=1Z0X118AYW08592000&requester=WT Server: Apache X-Frame-Options: SAMEORIGIN X-XSS-Protection: 1; mode=block X-Content-Type-Options: nosniff Strict-Transport-Security: max-age=31536000; includeSubDomains Cache-Control: no-store, no-cache Pragma: no-cache Location: https://www.ups.com/track?loc=en_IL&tracknum=1Z0X118AYW08592000&requester=WT Content-Length: 365 
Content-Type: text/html Date: Thu, 14 Jan 2021 00:32:34 GMT Connection: keep-alive Server-Timing: cdn-cache; desc=MISS Server-Timing: edge; dur=164 Server-Timing: origin; dur=23 Debug-AK-TLS: No bypass GET https://www.ups.com/track?loc=en_IL&tracknum=1Z0X118AYW08592000&requester=WT Upgrade-Insecure-Requests: 1 Content-Type: application/x-www-form-urlencoded User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9 HTTP/1.1 200 OK Server: Apache X-Frame-Options: SAMEORIGIN X-XSS-Protection: 1; mode=block X-Content-Type-Options: nosniff Strict-Transport-Security: max-age=31536000; includeSubDomains Cache-Control: no-store, no-cache Pragma: no-cache Content-Type: text/html; charset=utf-8 Content-Encoding: gzip Debug-AK-TLS: No bypass X-Akamai-Transformed: 9 9152 0 pmb=mTOE,1mRUM,1 Date: Thu, 14 Jan 2021 00:32:34 GMT Content-Length: 10947 Connection: keep-alive Vary: Accept-Encoding Server-Timing: cdn-cache; desc=MISS Server-Timing: edge; dur=182 Server-Timing: origin; dur=201 https://www.facebook.com/tr/?id=969628123173894&ev=PageView&dl=https%3A%2F%2Fwww.ups.com%2Ftrack%3Floc%3Den_IL%26tracknum%3D1Z0X118AYW08592000%26requester%3DWT%2Ftrackdetails&rl=https%3A%2F%2Fwww.ups.com%2F&if=false&ts=1610584355509&sw=1920&sh=1080&v=2.9.32&r=stable&a=tmtealium&ec=0&o=30&fbp=fb.1.1598067407332.38393503&it=1610584355413&coo=false&dpo=LDU&dpoco=0&dpost=0&rqm=GET GET https://www.facebook.com/tr/?id=969628123173894&ev=PageView&dl=https%3A%2F%2Fwww.ups.com%2Ftrack%3Floc%3Den_IL%26tracknum%3D1Z0X118AYW08592000%26requester%3DWT%2Ftrackdetails&rl=https%3A%2F%2Fwww.ups.com%2F&if=false&ts=1610584355509&sw=1920&sh=1080&v=2.9.32&r=stable&a=tmtealium&ec=0&o=30&fbp=fb.1.1598067407332.38393503&it=1610584355413&coo=false&dpo=LDU&dpoco=0&dpost=0&rqm=GET User-Agent: Mozilla/5.0 (Windows 
NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Accept: image/avif,image/webp,image/apng,image/*,*/*;q=0.8 HTTP/1.1 302 Redirect to: https://cx.atdmt.com/?c=1479770850078954307&f=AYzL_IHfyiIJ9HIa7oqq8XcmRPtLo6M0aKkForULuTS_d5qgkpmUtO1x4Rmi3jkdZ4EPRHG7qxKZDTiWb-BA5MYf&id=969628123173894&l=3&v=0 cache-control: no-cache, no-store, must-revalidate pragma: no-cache expires: 0 date: Thu, 14 Jan 2021 00:32:36 GMT location: https://cx.atdmt.com/?c=1479770850078954307&f=AYzL_IHfyiIJ9HIa7oqq8XcmRPtLo6M0aKkForULuTS_d5qgkpmUtO1x4Rmi3jkdZ4EPRHG7qxKZDTiWb-BA5MYf&id=969628123173894&l=3&v=0 content-type: text/plain content-length: 0 server: proxygen-bolt alt-svc: h3-29=":443"; ma=3600,h3-27=":443"; ma=3600 GET https://cx.atdmt.com/?c=1479770850078954307&f=AYzL_IHfyiIJ9HIa7oqq8XcmRPtLo6M0aKkForULuTS_d5qgkpmUtO1x4Rmi3jkdZ4EPRHG7qxKZDTiWb-BA5MYf&id=969628123173894&l=3&v=0 User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Accept: image/avif,image/webp,image/apng,image/*,*/*;q=0.8 HTTP/1.1 200 x-fb-rlafr: 0 content-type: image/gif date: Wed, 13 Jan 2021 16:32:37 PST x-content-type-options: nosniff report-to: {"group":"coep_report","max_age":86400,"endpoints":[{"url":"https:\/\/www.facebook.com\/browser_reporting\/"}]} cache-control: public, max-age=0 content-encoding: br x-frame-options: DENY cross-origin-resource-policy: cross-origin expires: Wed, 13 Jan 2021 16:32:37 PST vary: Accept-Encoding cross-origin-embedder-policy-report-only: require-corp;report-to="coep_report" pragma: public x-fb-debug: ySiesinmQSMWtIWGg5+rMp+g66R70GGiqJJC3M0DowZMGuFf14OidRiX02DfG99gXxjUSjCaEtHosxh/9tl/hQ==
I honestly do not believe this is possible. I checked how UPS loads its sites, and it seems to load the frontend first like this Get request to website preview then goes in to the api to grab the dates. For example, the delivered on date is stored in this api link ("https://www.ups.com/track/api/Track/GetStatus?loc=en_US") which needs a bunch of headers and has some akamai/security cookies (which may prevent you from scraping it). If you really do not want to use an api, I would suggest using something like Selenium if you do not need it to be quick/do not have many links to work with.
Your only choice is either Selenium OR API but that comes with a hitch. For what I can tell on the UPS website, their API only allows for queries at night. They only want "emergencies" to be hitting their API, which is preposterous since I would imagine web requests are hitting the same API.
Connect to websocket with cloudflare protection on python
The essence of the problem is that I used to connect to websocket by sending Origin, User-Agent, Cookies and the connection worked, now the domain owner decided to change it to the domain of the websocket and put cloudflare protection there, after which my connection method does not work . Advise some method, or information on how to connect to a web socket with cloudflare. Help me pls!! Example of my code: import websocket import json import time import traceback headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36 OPR/68.0.3618.173', 'cookie': '__cfduid=da97b059db0292806e2affdf9c3f4fd8b1593022325; _csrf=i8W6njc7hUXMOf4iQjiAxKg1; language=en; theme=darkTheme; pro_version=false; csgo_ses=1489162147d69debd9fe5d0ea2e445c87a117578d774502172d7151b89b82f7f; steamid=76561199068891508; avatar=https://steamcdn-a.akamaihd.net/steamcommunity/public/images/avatars/fe/fef49e7fa7e1997310d705b2a6158ff8dc1cdfeb_medium.jpg; username=andrewcrook232; thirdparty_token=06d04856ce6e334aa1368696df775e7ba0b1b898db135b0af0b5dc0fe001dd55; user_type=old; sellerid=6721648; type_device=desktop', 'origin': 'https://cs.money'} def start_ws(): try: ws = websocket.WebSocketApp("wss://ws.cs.money/ws", on_message = on_message, cookie = json.dumps(headers)) print("Connected") while True: ws.run_forever(ping_timeout=20) print("Reload") time.sleep(20) except: print(traceback.format_exc()) def on_message(ws, message): try: print(message) except: print(traceback.format_exc()) if __name__ == "__main__": start_ws() Below is all the information that I got with Chrome Inspector (f12) -> Network -> WS -> headers, this information should be more than enough to successfully join WSS. 
Request URL: wss://ws.cs.money/ws Request Method: GET Status Code: 101 Switching Protocols alt-svc: h3-27=":443"; ma=86400, h3-28=":443"; ma=86400, h3-29=":443"; ma=86400 CF-Cache-Status: DYNAMIC CF-RAY: 5a886ad37f4b8ac6-KBP cf-request-id: 038921182700008ac6798a2200000001 Connection: upgrade Date: Wed, 24 Jun 2020 18:12:29 GMT Expect-CT: max-age=604800, report-uri="https://report-uri.cloudflare.com/cdn-cgi/beacon/expect-ct" Sec-WebSocket-Accept: zrH4CEKXm3BY5z77HroJDqGgYSc= Server: cloudflare Strict-Transport-Security: max-age=31536000; includeSubDomains; preload Upgrade: websocket X-Content-Type-Options: nosniff Accept-Encoding: gzip, deflate, br Accept-Language: ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7 Cache-Control: no-cache Connection: Upgrade Host: ws.cs.money Origin: https://cs.money Pragma: no-cache Sec-WebSocket-Extensions: permessage-deflate; client_max_window_bits Sec-WebSocket-Key: GXVT8QewAgPEZDEZZ+x3dA== Sec-WebSocket-Version: 13 Upgrade: websocket User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36 OPR/68.0.3618.173 Also additional page data: Request URL: https://cs.money/ Request Method: GET Status Code: 200 Remote Address: 104.20.76.156:443 Referrer Policy: no-referrer-when-downgrade alt-svc: h3-27=":443"; ma=86400, h3-28=":443"; ma=86400, h3-29=":443"; ma=86400 cf-cache-status: DYNAMIC cf-ray: 5a886ab5adac8aea-KBP cf-request-id: 038921058800008aea96109200000001 content-encoding: br content-security-policy: script-src 'self' cs.money dev.csgo.trade gleam.io www.am4charts.com translate.google.com translate.googleapis.com www.googletagmanager.com www.googleoptimize.com www.google-analytics.com connect.facebook.net https://vk.com 'unsafe-inline' top-fwz1.mail.ru 'unsafe-eval' api.usersnap.com cdn.usersnap.com cs.money mc.yandex.ru diffuser-cdn.app-us1.com diffuser-cdn.app-us1.com prism.app-us1.com trackcmp.net api.basisid.com https://cdn.amplitude.com sc-static.net support.cs.money 
embed-sandbox.bridgerpay.com embed.bridgerpay.com cs.money; worker-src 'self' data: blob: cs.money; object-src cs.money dota.money; media-src cs.money dota.money; frame-src cs.money dota.money onesignal.com https://*.com https://*.ru https://*.ua http://www.youtube.com content-type: text/html; charset=utf-8 date: Wed, 24 Jun 2020 18:12:25 GMT expect-ct: max-age=604800, report-uri="https://report-uri.cloudflare.com/cdn-cgi/beacon/expect-ct" server: cloudflare set-cookie: user_type=old; Path=/ set-cookie: language=en; Max-Age=8640000; Domain=cs.money; Path=/; Expires=Fri, 02 Oct 2020 18:12:25 GMT set-cookie: language=en; Max-Age=8640000; Domain=.cs.money; Path=/; Expires=Fri, 02 Oct 2020 18:12:25 GMT set-cookie: sellerid=6721648; Max-Age=8640000; Domain=cs.money; Path=/; Expires=Fri, 02 Oct 2020 18:12:25 GMT set-cookie: pro_version=false; Max-Age=8640000; Domain=cs.money; Path=/; Expires=Fri, 02 Oct 2020 18:12:25 GMT status: 200 strict-transport-security: max-age=31536000; includeSubDomains; preload x-cache-status: BYPASS x-content-type-options: nosniff x-dns-prefetch-control: off x-download-options: noopen x-frame-options: SAMEORIGIN x-powered-by: PHP 4.1.0 x-xss-protection: 1; mode=block :authority: cs.money :method: GET :path: / :scheme: https accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9 accept-encoding: gzip, deflate, br accept-language: ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7 cache-control: max-age=0 cookie: __cfduid=da97b059db0292806e2affdf9c3f4fd8b1593022325; _csrf=i8W6njc7hUXMOf4iQjiAxKg1; language=en; theme=darkTheme; pro_version=false; csgo_ses=1489162147d69debd9fe5d0ea2e445c87a117578d774502172d7151b89b82f7f; steamid=76561199068891508; avatar=https://steamcdn-a.akamaihd.net/steamcommunity/public/images/avatars/fe/fef49e7fa7e1997310d705b2a6158ff8dc1cdfeb_medium.jpg; username=andrewcrook232; thirdparty_token=06d04856ce6e334aa1368696df775e7ba0b1b898db135b0af0b5dc0fe001dd55; 
user_type=old; sellerid=6721648; type_device=desktop referer: https://steamcommunity.com/openid/login?openid.mode=checkid_setup&openid.ns=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.claimed_id=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.return_to=https%3A%2F%2Fauth.dota.trade%2Flogin%2Fcallback%3FredirectUrl%3Dhttps%3A%2F%2Fcs.money%26callbackUrl%3Dhttps%3A%2F%2Fcs.money%2Flogin&openid.realm=https%3A%2F%2Fauth.dota.trade sec-fetch-dest: document sec-fetch-mode: navigate sec-fetch-site: cross-site sec-fetch-user: ?1 upgrade-insecure-requests: 1 user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36 OPR/68.0.3618.173
I'm not sure about the real cause, but it looks like your code has a bug. If you need to open a websocket connection with custom headers, pass them via the header parameter instead of JSON-dumping them into the cookie parameter. ws = websocket.WebSocketApp("wss://ws.cs.money/ws", on_message = on_message, cookie = json.dumps(headers)) should be cookie_string = headers['cookie'] del headers['cookie'] header_without_cookie = headers ws = websocket.WebSocketApp("wss://ws.cs.money/ws", on_message = on_message, header = header_without_cookie, cookie = cookie_string) The websocket-client documentation is sparse, so you may want to read the source code to see how it is used: https://github.com/websocket-client/websocket-client/blob/2222f2c49d71afd74fcda486e3dfd14399e647af/websocket/_app.py
HEAD request in python not working as desired
I am trying to check the status code of any URL in Python using the following code: class HeadRequest(urllib2.Request): def get_method(self): return "HEAD" When I use it like this: response = urllib2.urlopen(HeadRequest("http://www.nativeseeds.org/")) it throws the following exception: HTTPError: HTTP Error 503: Service Temporarily Unavailable However, when I open the above URL "http://www.nativeseeds.org/" in Firefox/RESTClient, it returns a 200 status code. Any help will be highly appreciated.
After some investigating, it turns out that the website requires both the Accept and User-Agent request headers to be set. Otherwise, it returns a 503. This is terribly broken. It also appears to be doing user-agent sniffing. I get a 403 when using curl: $ curl --head http://www.nativeseeds.org/ HTTP/1.1 403 Forbidden Date: Wed, 26 Sep 2012 14:54:59 GMT Server: Apache P3P: CP="NOI ADM DEV PSAi COM NAV OUR OTRo STP IND DEM" Set-Cookie: f65129b0cd2c5e10c387f919ac90ad66=PjZxNjvNmn6IlVh4Ac-tH0; path=/ Vary: Accept-Encoding Content-Type: text/html but it works fine if I set the user-agent to that of a browser: $ curl --user-agent "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)" --head http://www.nativeseeds.org/ HTTP/1.1 200 OK Date: Wed, 26 Sep 2012 14:55:57 GMT Server: Apache P3P: CP="NOI ADM DEV PSAi COM NAV OUR OTRo STP IND DEM" Expires: Mon, 1 Jan 2001 00:00:00 GMT Cache-Control: post-check=0, pre-check=0 Pragma: no-cache Set-Cookie: f65129b0cd2c5e10c387f919ac90ad66=ykOpGnEE%2CQOMUaVJLnM7W0; path=/ Last-Modified: Wed, 26 Sep 2012 14:56:27 GMT Vary: Accept-Encoding Content-Type: text/html; charset=utf-8 It also appears to work using the requests module: >>> import requests >>> r = requests.head('http://www.nativeseeds.org/') >>> import pprint; pprint.pprint(r.headers) {'cache-control': 'post-check=0, pre-check=0', 'content-encoding': 'gzip', 'content-length': '20', 'content-type': 'text/html; charset=utf-8', 'date': 'Wed, 26 Sep 2012 14:58:05 GMT', 'expires': 'Mon, 1 Jan 2001 00:00:00 GMT', 'last-modified': 'Wed, 26 Sep 2012 14:58:09 GMT', 'p3p': 'CP="NOI ADM DEV PSAi COM NAV OUR OTRo STP IND DEM"', 'pragma': 'no-cache', 'server': 'Apache', 'set-cookie': 'f65129b0cd2c5e10c387f919ac90ad66=2NtRrDBra9jPsszChZXDm2; path=/', 'vary': 'Accept-Encoding'}
The problem you see has nothing to do with Python. The website itself seems to require something more than just a HEAD request. Even a simple telnet session results in the error: $ telnet www.nativeseeds.org 80 Trying 208.113.230.85... Connected to www.nativeseeds.org (208.113.230.85). Escape character is '^]'. HEAD / HTTP/1.1 Host: www.nativeseeds.org HTTP/1.1 503 Service Temporarily Unavailable Date: Wed, 26 Sep 2012 14:29:33 GMT Server: Apache Vary: Accept-Encoding Connection: close Content-Type: text/html; charset=iso-8859-1 Try adding some more headers; the http command line client does get a 200 response: $ http -v head http://www.nativeseeds.org HEAD / HTTP/1.1 Host: www.nativeseeds.org Content-Type: application/x-www-form-urlencoded; charset=utf-8 Accept-Encoding: identity, deflate, compress, gzip Accept: */* User-Agent: HTTPie/0.2.2 HTTP/1.1 200 OK Date: Wed, 26 Sep 2012 14:33:21 GMT Server: Apache P3P: CP="NOI ADM DEV PSAi COM NAV OUR OTRo STP IND DEM" Expires: Mon, 1 Jan 2001 00:00:00 GMT Cache-Control: post-check=0, pre-check=0 Pragma: no-cache Set-Cookie: f65129b0cd2c5e10c387f919ac90ad66=34hOijDSzeskKYtULx9V83; path=/ Last-Modified: Wed, 26 Sep 2012 14:33:23 GMT Vary: Accept-Encoding Content-Encoding: gzip Content-Length: 20 Content-Type: text/html; charset=utf-8
Reading urllib2 docs, get_method only returns 'GET' or 'POST'. You may be interested in this.
mechanize stuck on page loading till timeout while regular browsers work fine, no javascript/ajax
Using Python and mechanize, I'm trying to log in to a website. The code I'm using is: from mechanize import Browser def calOnline(uname,passwd): br = Browser() br.set_debug_http(True) br.set_debug_redirects(True) br.set_debug_responses(True) br.set_handle_redirect(True) br.set_handle_robots(False) br.set_handle_refresh(True) #~ br.encoding() br.addheaders = [('User-Agent','Mozilla/5.0 (X11; Linux x86_64; rv:13.0) Gecko/20100101 Firefox/13.0.1'), ('Accept','text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'), ('Accept-Language','en-us,en;q=0.5'), ('Accept-Encoding','gzip, deflate'), ('Connection','keep-alive')] print('opening site') br.open('https://m.cal-online.co.il/') print('\n\nsubmiting first form') br.select_form(nr=0) br.submit() print('\n\nselecting login form') br.select_form(nr=0) br['ctl00$cphMain$LGN$UserName'] = uname br['ctl00$cphMain$LGN$Password'] = passwd print('\n\nsubmitting form') br.submit(name='ctl00$cphMain$LGN$LoginButton') print('\n\nOpening details page') res = br.open('https://m.cal-online.co.il/SCREENS/Transactions/TrSearch.aspx') print res.read() uname = 'someuname' passwd = 'somepasswd' a = calOnline(uname, passwd) print a.read() The website is the mobile version of a credit card website from which I'm trying to get my expenses. A browser (Android as mobile or Firefox/Chrome/Opera as desktop) opens the site and logs in without any issue. 
While trying to login programatically it just stuck, here is debug log with iOS user-agent: opening site send: 'GET / HTTP/1.1\r\nAccept-Language: en-us,en;q=0.5\r\nAccept-Encoding: gzip, deflate\r\nHost: m.cal-online.co.il\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\nUser-Agent: Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16\r\nConnection: close\r\n\r\n' reply: 'HTTP/1.1 200 OK\r\n' header: Server: Apache-Coyote/1.1 header: X-Powered-By: ASP.NET header: Set-Cookie: JSESSIONID=E46452F3D9D4B7303C6E93F04BE54449; Path=/; Secure header: X-MA-MIS-Device: root^html^mozilla/5^safari^appleiphone^appleiphone(os_3.0) header: Server: Microsoft-IIS/6.0 header: X-AspNet-Version: 2.0.50727 header: Date: Fri, 29 Jun 2012 21:44:52 GMT header: Cache-Control: no-cache, no-store, must-revalidate, no-transform header: Pragma: no-cache header: Expires: -1 header: Content-Type: text/html;charset=utf-8 header: Content-Length: 302 header: Connection: close send: 'GET /SCREENS/AccountManagement/HomePage.aspx HTTP/1.1\r\nAccept-Language: en-us,en;q=0.5\r\nAccept-Encoding: gzip, deflate\r\nHost: m.cal-online.co.il\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\nUser-Agent: Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16\r\nConnection: close\r\nCookie: JSESSIONID=E46452F3D9D4B7303C6E93F04BE54449\r\n\r\n' reply: 'HTTP/1.1 302 Moved Temporarily\r\n' header: Server: Apache-Coyote/1.1 header: X-Powered-By: ASP.NET header: X-MA-MIS-Device: root^html^mozilla/5^safari^appleiphone^appleiphone(os_3.0) header: Server: Microsoft-IIS/6.0 header: X-AspNet-Version: 2.0.50727 header: Location: https://m.cal-online.co.il/SCREENS/AccountManagement/Opening.aspx?ReturnUrl=%2fSCREENS%2fAccountManagement%2fHomePage.aspx header: Content-Length: 0 header: Date: Fri, 
29 Jun 2012 21:44:53 GMT header: Connection: close send: 'GET /SCREENS/AccountManagement/Opening.aspx?ReturnUrl=%2fSCREENS%2fAccountManagement%2fHomePage.aspx HTTP/1.1\r\nAccept-Language: en-us,en;q=0.5\r\nAccept-Encoding: gzip, deflate\r\nHost: m.cal-online.co.il\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\nUser-Agent: Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16\r\nConnection: close\r\nCookie: JSESSIONID=E46452F3D9D4B7303C6E93F04BE54449\r\n\r\n' reply: 'HTTP/1.1 200 OK\r\n' header: Server: Apache-Coyote/1.1 header: X-Powered-By: ASP.NET header: X-MA-MIS-Device: root^html^mozilla/5^safari^appleiphone^appleiphone(os_3.0) header: Server: Microsoft-IIS/6.0 header: X-AspNet-Version: 2.0.50727 header: Date: Fri, 29 Jun 2012 21:44:55 GMT header: Cache-Control: private header: Content-Type: text/html;charset=utf-8 header: Content-Length: 1256 header: Connection: close send: 'GET /SCREENS/AccountManagement/Login.aspx HTTP/1.1\r\nAccept-Language: en-us,en;q=0.5\r\nAccept-Encoding: gzip, deflate\r\nHost: m.cal-online.co.il\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\nUser-Agent: Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16\r\nConnection: close\r\nCookie: JSESSIONID=E46452F3D9D4B7303C6E93F04BE54449\r\n\r\n' reply: 'HTTP/1.1 302 Moved Temporarily\r\n' header: Server: Apache-Coyote/1.1 header: X-Powered-By: ASP.NET header: X-MA-MIS-Device: root^html^mozilla/5^safari^appleiphone^appleiphone(os_3.0) header: Server: Microsoft-IIS/6.0 header: X-AspNet-Version: 2.0.50727 header: Location: https://m.cal-online.co.il/SCREENS/AccountManagement/Login.aspx?cc=c009a07&rnd=2103197098 header: Set-Cookie: test_cookie=ok; expires=Sat, 30-Jun-2012 21:45:12 GMT; path=/ header: Content-Length: 0 header: Date: Fri, 29 Jun 2012 21:45:07 GMT 
header: Connection: close send: 'GET /SCREENS/AccountManagement/Login.aspx?cc=c009a07&rnd=2103197098 HTTP/1.1\r\nAccept-Language: en-us,en;q=0.5\r\nAccept-Encoding: gzip, deflate\r\nHost: m.cal-online.co.il\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\nUser-Agent: Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16\r\nConnection: close\r\nCookie: test_cookie=ok; JSESSIONID=E46452F3D9D4B7303C6E93F04BE54449\r\n\r\n' reply: 'HTTP/1.1 200 OK\r\n' header: Server: Apache-Coyote/1.1 header: X-Powered-By: ASP.NET header: X-MA-MIS-Device: root^html^mozilla/5^safari^appleiphone^appleiphone(os_3.0) header: Server: Microsoft-IIS/6.0 header: X-AspNet-Version: 2.0.50727 header: Set-Cookie: ASP.NET_SessionId=clocqc55tdzykh45zql10045; path=/; HttpOnly header: Date: Fri, 29 Jun 2012 21:45:09 GMT header: Cache-Control: no-cache, no-store, must-revalidate, no-transform header: Pragma: no-cache header: Expires: -1 header: Content-Type: text/html;charset=utf-8 header: Content-Length: 3153 header: Connection: close submiting first form send: 'POST /SCREENS/AccountManagement/Login.aspx?rnd=2103197098&cc=c009a07 HTTP/1.1\r\nContent-Length: 189\r\nAccept-Language: en-us,en;q=0.5\r\nAccept-Encoding: gzip, deflate\r\nConnection: close\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\nUser-Agent: Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16\r\nHost: m.cal-online.co.il\r\nReferer: https://m.cal-online.co.il/\r\nCookie: test_cookie=ok; ASP.NET_SessionId=clocqc55tdzykh45zql10045; JSESSIONID=E46452F3D9D4B7303C6E93F04BE54449\r\nContent-Type: 
application/x-www-form-urlencoded\r\n\r\n__VIEWSTATE=&ctl00%24__MATRIX_VIEWSTATE=1&ctl00%24cphMain%24LGN%24UserName=&ctl00%24cphMain%24LGN%24Password=&ctl00%24cphMain%24LGN%24LoginButton.x=1&ctl00%24cphMain%24LGN%24LoginButton.y=1' reply: 'HTTP/1.1 200 OK\r\n' header: Server: Apache-Coyote/1.1 header: X-Powered-By: ASP.NET header: X-MA-MIS-Device: root^html^mozilla/5^safari^appleiphone^appleiphone(os_3.0) header: Server: Microsoft-IIS/6.0 header: X-AspNet-Version: 2.0.50727 header: Date: Fri, 29 Jun 2012 21:45:10 GMT header: Cache-Control: no-cache, no-store, must-revalidate, no-transform header: Pragma: no-cache header: Expires: -1 header: Content-Type: text/html;charset=utf-8 header: Content-Length: 3210 header: Connection: close selecting login form submitting form send: 'POST /SCREENS/AccountManagement/Login.aspx?rnd=2103197098&cc=c009a07 HTTP/1.1\r\nContent-Length: 206\r\nAccept-Language: en-us,en;q=0.5\r\nAccept-Encoding: gzip, deflate\r\nConnection: close\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\nUser-Agent: Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16\r\nHost: m.cal-online.co.il\r\nReferer: https://m.cal-online.co.il/SCREENS/AccountManagement/Login.aspx?rnd=2103197098&cc=c009a07\r\nCookie: test_cookie=ok; ASP.NET_SessionId=clocqc55tdzykh45zql10045; JSESSIONID=E46452F3D9D4B7303C6E93F04BE54449\r\nContent-Type: application/x-www-form-urlencoded\r\n\r\n__VIEWSTATE=&ctl00%24__MATRIX_VIEWSTATE=2&ctl00%24cphMain%24LGN%24UserName=<uname>&ctl00%24cphMain%24LGN%24Password=<passwd>&ctl00%24cphMain%24LGN%24LoginButton.x=1&ctl00%24cphMain%24LGN%24LoginButton.y=1' reply: 'HTTP/1.1 302 Moved Temporarily\r\n' header: Server: Apache-Coyote/1.1 header: X-Powered-By: ASP.NET header: X-MA-MIS-Device: root^html^mozilla/5^safari^appleiphone^appleiphone(os_3.0) header: Server: Microsoft-IIS/6.0 header: X-AspNet-Version: 2.0.50727 header: Location: 
https://m.cal-online.co.il/SCREENS/AccountManagement/HomePage.aspx header: Set-Cookie: .ASPXAUTH=478FDDCD007398FEB264895D0F6EDB51B391DD0F5FBA3C71FC6A9E747AF3A97E6382E7B939614DFC07B25A1D4A641ED121F15508483A676AC49BAA550BEADF382F93792E849F63E99B03FA45143391ACD5E18CA7124FAC43AC378D16703DB5B2A374E4D1B3278BF9B886F3B4A41BB12E3569162D; path=/; HttpOnly header: Content-Length: 0 header: Date: Fri, 29 Jun 2012 21:45:14 GMT header: Connection: close send: 'GET /SCREENS/AccountManagement/HomePage.aspx HTTP/1.1\r\nAccept-Language: en-us,en;q=0.5\r\nAccept-Encoding: gzip, deflate\r\nConnection: close\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\nUser-Agent: Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16\r\nHost: m.cal-online.co.il\r\nCookie: test_cookie=ok; ASP.NET_SessionId=clocqc55tdzykh45zql10045; .ASPXAUTH=478FDDCD007398FEB264895D0F6EDB51B391DD0F5FBA3C71FC6A9E747AF3A97E6382E7B939614DFC07B25A1D4A641ED121F15508483A676AC49BAA550BEADF382F93792E849F63E99B03FA45143391ACD5E18CA7124FAC43AC378D16703DB5B2A374E4D1B3278BF9B886F3B4A41BB12E3569162D; JSESSIONID=E46452F3D9D4B7303C6E93F04BE54449\r\nReferer: https://m.cal-online.co.il/SCREENS/AccountManagement/Login.aspx?rnd=2103197098&cc=c009a07\r\n\r\n' reply: 'HTTP/1.1 200 OK\r\n' header: Server: Apache-Coyote/1.1 header: X-Powered-By: ASP.NET header: X-MA-MIS-Device: root^html^mozilla/5^safari^appleiphone^appleiphone(os_3.0) header: Server: Microsoft-IIS/6.0 header: X-AspNet-Version: 2.0.50727 header: Date: Fri, 29 Jun 2012 21:45:16 GMT header: Cache-Control: no-cache, no-store, must-revalidate, no-transform header: Pragma: no-cache header: Expires: -1 header: Content-Type: text/html;charset=utf-8 header: Content-Length: 5235 header: Connection: close Same stuff happens with my Firefox User-agent. 
It just gets stuck at this point until the website times out, which in turn redirects me to the LogOut page due to inactivity. Is there something I'm missing? Thanks in advance.