python mechanize returns wrong 302 location - python

I am trying to do some automation for this site http://www.beistle.com/ but for visiting the search page i need pid which is generated on the server.
I looked for a response the browser makes and tried to do the same with mechanize in python.
# Reproduce the browser's search POST against beistle.com so the 302
# Location (which carries the server-generated pid) can be inspected.
import mechanize
import cookielib
import urllib2
import lxml.html

br = mechanize.Browser()
cookie_jar = cookielib.LWPCookieJar()
br.set_cookiejar(cookie_jar)

# Behave like a regular browser, but keep redirect handling OFF so the
# raw 302 response (and its Location header) is returned to us directly.
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(False)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

# Mirror the headers a real Firefox session sends.
request_headers = [
    ('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'),
    ('Referer', 'http://www.beistle.com/'),
    ('Connection', 'keep-alive'),
    ('Host', 'www.beistle.com'),
    ('Accept-Language', 'en-US,en;q=0.5'),
    ('Accept-Encoding', 'gzip, deflate'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
]
br.addheaders = request_headers

# Wire-level logging, to diff against what the real browser sends.
br.set_debug_http(True)
br.set_debug_redirects(True)
br.set_debug_responses(True)

br.open("http://www.beistle.com/")
br.select_form(name='aspnetForm')
br.submit()
In browser it returns 302 with correct location in which the pid is written. However mechanize returns "Location: /Search.aspx" which can't be used for searching

Related

How to use Internet Explorer as the browser in mechanize

import mechanize
import cookielib

# Minimal mechanize browser that presents a custom User-Agent; the
# User-agent header is the only thing that identifies the "browser"
# to the server (mechanize does not embed any real browser engine).
br = mechanize.Browser()
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
# Browser options
br.set_handle_equiv(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=5)
br.addheaders = [('User-agent', 'Chrome')]
# BUG FIX: mechanize/urllib2 require an explicit scheme; a bare
# 'www.google.com' fails with "URLError: unknown url type".
url1 = 'http://www.google.com'
br.open(url1)
Here I tried to open the link as Chrome. If I want to use Internet Explorer instead, what changes do I have to make here?
IE runs on the Trident (MSHTML) browser engine. Please try this code.
import sys
from mechanize import Browser

# Download the URL given as argv[1] into the file named by argv[2],
# while presenting an IE 11 (Trident/MSHTML) User-Agent string so the
# server serves its Internet Explorer variant of the page.
br = Browser()
br.set_handle_robots(False)
br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko')]

target_url = str(sys.argv[1])
output_name = str(sys.argv[2])
f = br.retrieve(target_url, output_name)[0]
Also, of course, you can open your IE, and get UserAgent string in JavaScript console. Open Developer Tools (Ctrl + Shift + j, F12 on different machines) and run in JS console this code: navigator.userAgent;.

How can I navigate a site after logging in

I have used mechanize and successfully logged into a user login page. Now I want to navigate the site to a specific page in the submenus. When I try this by opening the URL of the specific page after logging in, another login page comes up which I do not have a username and password for. This log in page does not usually show up when I am navigating the site on a web browser.
How can I do this?
import mechanize
import webbrowser
import cookielib

usern = '****'
passw = '****'

br = mechanize.Browser()
# BUG FIX: attach the cookie jar BEFORE the first request; in the original
# it was attached after br.open(), so the session cookies set by the login
# page were never stored.
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
br.set_handle_robots(False)
# BUG FIX: the original User-Agent string was missing the opening '(' after
# "Mozilla/5.0", producing a malformed header value.
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

r = br.open("https://myunihub-1.mdx.ac.uk/cas-web/login?service=https%3A%2F%2Fmyunihub.mdx.ac.uk%2Fc%2Fportal%2Flogin")
br.select_form(nr=0)
br.form['username'] = usern
br.form['password'] = passw
# BUG FIX: 'br.submit' only referenced the bound method and discarded it;
# it must be CALLED to actually post the login form.
br.submit()

url = "https://misis.mdx.ac.uk/mislve/bwskfshd.P_CrseSchd"
# NOTE(review): webbrowser launches a separate desktop browser that does NOT
# share mechanize's cookie jar, so the external browser is still logged out.
# To stay inside the authenticated session, fetch the page with br.open(url)
# and read the response instead.
webbrowser.open_new(url)
Try to use cookies and pretend to be an actual browser. Some sites don't allow automated scripts/robots to crawl them, but you can always tell them you are an actual browser.
# Create a cookie jar and attach it so session cookies survive across
# requests made by the (externally created) mechanize browser `br`.
import cookielib
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
And let's pretend we are not a robot but an actual browser.
# Stop honouring robots.txt and present a desktop Firefox User-Agent
# so the site treats the script as an ordinary browser.
br.set_handle_robots(False)
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

How can i get fully loaded html through python-mechanize?

Hi I'm using python mechanize to get datas from webpages.
I'm trying to get imgurl from google image search webpage to download search result images.
Here's my code
I fill search form as 'dog' and submit. (search 'dog')
# Submit a Google Images search for 'dog' and dump the raw result HTML
# to 1.html for offline inspection.
import mechanize
import cookielib
import urllib2
import urllib

br = mechanize.Browser()
cookie_jar = cookielib.LWPCookieJar()
br.set_cookiejar(cookie_jar)

# Act like a normal browser: follow redirects and meta-refreshes,
# ignore robots.txt.
br.set_handle_equiv(True)
br.set_handle_redirect(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time = 1)
br.addheaders = [('User-agent', 'Mozilla/5.0 (x11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'), ('Accept', '*/*') ,('Accept-Language', 'ko-KR')]

# Open the image-search page, fill the first form's query box and submit.
br.open('http://www.google.com/imghp?hl=en')
br.select_form(nr=0)
br.form['q'] = 'dog'
response = br.submit()
searched_url = br.geturl()

# Save the (server-rendered, JavaScript-free) result page.
with open("1.html", "wb") as out:
    out.write(response.read())
When I view the page source in the Chrome browser, there are 'imgurl's in it. But when I read the data through
python mechanize, there are no such things.
also, the size of 1.html(which i write by python) is much smaller than html file downloaded from chrome.
How can i get exactly same html data as web-browsers by using python?
Do i have to set request headers same as web-browsers?
thanks

Python Mechanize HTTP Error 403: request disallowed by robots.txt [duplicate]

This question already has answers here:
Screen scraping: getting around "HTTP Error 403: request disallowed by robots.txt"
(8 answers)
Closed 8 years ago.
So, I created a Django website to web-scrape news webpages for articles..
Even though I use mechanize, it still keeps telling me:
HTTP Error 403: request disallowed by robots.txt
I tried everything, look at my code(Just the part to scrape):
# NOTE(review): br.open() is issued BEFORE set_handle_robots(False), so the
# robots.txt check still applies to that request -- this ordering is the
# cause of the "HTTP Error 403: request disallowed by robots.txt" failure.
br = mechanize.Browser()
page = br.open(web)
br.set_handle_robots(False)
br.set_handle_equiv(False)
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
#BeautifulSoup
htmlcontent = page.read()
soup = BeautifulSoup(htmlcontent)
I also tried calling br.open before set_handle_robots(False), etc. It didn't work either.
Is there any way to get through to these sites?
You're setting br.set_handle_robots(False) after br.open()
It should be:
# Configure the browser fully BEFORE the first request: robots handling,
# HTTP-Equiv processing and the User-Agent must all be in place when
# br.open() runs, otherwise the robots.txt check still rejects it.
br = mechanize.Browser()
br.set_handle_robots(False)
br.set_handle_equiv(False)
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

# Now fetch the page and hand the HTML to BeautifulSoup.
response = br.open(web)
raw_html = response.read()
soup = BeautifulSoup(raw_html)

scraping way2sms with mechanize

I am trying to send an SMS by scraping way2sms.com, but I am unable to log into way2sms.com using mechanize.
I am using following code to submit the login form.
# Log in to way2sms: open the pre-home page, follow the sixth link to
# reach the login form, fill in the credentials and post the form.
import mechanize

br = mechanize.Browser()
br.set_handle_robots(False)
br.set_handle_refresh(False)
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:18.0) Gecko/20100101 Firefox/18.0')]

res = br.open('http://wwwa.way2sms.com/content/prehome.jsp')
login_link = list(br.links())[5]
res = br.follow_link(login_link)

# Pick the first form on the login page and fill in the account details.
br.form = list(br.forms())[0]
br.form.find_control('username').value = USERNAME  # user name
br.form.find_control('password').value = PASSWORD  # password
res = br.submit()
After submitting the form, again the login page is received.
Just replace username and password with your username and password.
import mechanize
import cookielib
br = mechanize.Browser()
# Cookie Jar
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
# Follows refresh 0 but not hangs on refresh > 0
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
# User-Agent
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
url = 'http://site25.way2sms.com/content/index.html?'
#Opening WEbsite
op = br.open(url)
#Selection form
br.select_form(nr=0)
username = 'mobilenumberhere'
password = 'passwordhere'
#Give username and password
br.form['username'] = username
br.form['password'] = password
br.submit()
#To check whether log in Successful or not
if username in br.geturl():
print "Login Failed" # Go to way2sms and enter wrong details. You will understand this.
else:
print "Login Successful. You are at ", br.geturl()

Categories