Following a link with href="#" in python mechanize

I'm trying to click the "Next" button on the following page, with the ultimate goal of cycling through pages 2-8 using Python + mechanize.
https://www.ncbi.nlm.nih.gov/pubmed/?term=shi+LL
I'm using the following code:
import mechanize
import cookielib
from bs4 import BeautifulSoup
import urllib
br = mechanize.Browser()
# Cookie Jar
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
# Follow refresh 0 but don't hang on refresh > 0
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
# Want debugging messages?
#br.set_debug_http(True)
#br.set_debug_redirects(True)
#br.set_debug_responses(True)
# User-Agent (this is cheating, ok?)
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
br.open("https://www.ncbi.nlm.nih.gov/pubmed/?term=shi+LL")
request = br.click_link(link)
response = br.follow_link(link)
print response.geturl()
But I don't know what to put in the link variable, since the "Next" button has href="#" and there are multiple items on the same page with that same href...
This is html corresponding to the next button at the top of the page:
<a name="EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Entrez_Pager.Page" title="Next page of results" class="active page_link next" href="#" sid="3" page="2" accesskey="k" id="EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Entrez_Pager.Page">Next ></a>
I've also tried to cycle through the pages by entering the numbers 2-8 into the page-number text control at the top of the page, with no luck, since there is no submit/search button anywhere.
Any ideas?
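One approach worth trying (a sketch, not a verified solution): since the "Next" anchor is JavaScript-driven, skip the link entirely and drive the pager form instead, which is what the text control at the top of the page belongs to. The form name 'EntrezForm' and the ...Entrez_Pager.cPage control name are assumptions inferred from the anchor's name attribute, so print br.forms() on the live page to confirm them:
# A sketch, continuing from the br.open(...) call above.
for page in range(2, 9):
    br.select_form(name='EntrezForm')  # assumed form name -- verify with br.forms()
    br.form.set_all_readonly(False)    # the pager text control is read-only by default
    br.form['EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Entrez_Pager.cPage'] = str(page)
    response = br.submit()
    print response.geturl()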

Related

Extract Hindi, Tamil, Punjabi (Indian languages) posts from a social networking site

I am using Python and Beautiful Soup, trying to extract Hindi, Tamil, and Punjabi (Indian language) posts from a social networking site with the help of cookies. I am able to extract the posts, but the text does not come out in the language itself; it is in some encoded form. I want it in the original language, e.g. a Hindi post should be extracted in Hindi.
import mechanize
import cookielib
from bs4 import BeautifulSoup
import urllib2
import csv
from html2text import html2text
import re
br = mechanize.Browser()
# Cookie Jar
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
# Follow refresh 0 but don't hang on refresh > 0
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
urls = []
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'),('Connection','keep-alive'),('Accept','application/json, text/javascript, */*; q=0.01'),('Accept-Encoding','gzip, deflate, sdch'),('Host','link'),('Referer','https://link/'),('X-Requested-With','XMLHttpRequest'),('Accept-Language','en-US,en;q=0.8')]
br.open('https://link')
br._factory.is_html = True
# Select the first (index zero) form
#br.select_form(predicate=lambda f: f.attrs.get('id', None) == 'login_form')
br.select_form(nr=0)
# User credentials
br.form['USER'] = 'username'
br.form['PASSWORD'] = 'password'
# Login
br.submit()
soup = BeautifulSoup(br.response().read())
for tag in soup.find_all("div", re.compile("classname")):
    #print tag
    for tag1 in tag.find_all(re.compile("^p")):
        print tag1
Output sample:
\u0baa\u0b9f\u0bbf\u0ba4\u0bcd\u0ba4\u0ba4\u0bbf\u0bb2\u0bcd \u0baa\u0bbf\u0b9f\u0bbf\u0ba4\u0bcd\u0ba4\u0ba4\u0bc1 \u263a
Expected output: text written in that particular language (here, Tamil).
unicode-escape worked for me:
.decode('unicode-escape')
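For example, a minimal sketch in Python 2, using a piece of the output above (in a Python 2 byte string the \uXXXX sequences stay literal, which is exactly what the scrape produces):
raw = '\u0baa\u0b9f\u0bbf\u0ba4\u0bcd\u0ba4\u0ba4\u0bc1'
text = raw.decode('unicode-escape')  # str with literal \uXXXX escapes -> unicode
print text.encode('utf-8')           # prints the actual Tamil characters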

How can I get fully loaded HTML through python-mechanize?

Hi, I'm using python mechanize to get data from web pages.
I'm trying to get the imgurl values from the Google image search results page so I can download the result images.
Here's my code.
I fill in the search form with 'dog' and submit it (i.e. search for 'dog').
import mechanize
import cookielib
import urllib2
import urllib
br = mechanize.Browser()
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
br.set_handle_equiv(True)
br.set_handle_redirect(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time = 1)
br.addheaders = [('User-agent', 'Mozilla/5.0 (x11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'), ('Accept', '*/*') ,('Accept-Language', 'ko-KR')]
br.open('http://www.google.com/imghp?hl=en')
br.select_form(nr=0)
br.form['q'] = 'dog'
a = br.submit()
searched_url = br.geturl()
file0 = open("1.html", "wb")
file0.write(a.read())
file0.close()
When I view the page source in the Chrome browser there are imgurl values in it, but when I read the data through python mechanize there is no such thing.
Also, the size of 1.html (which I write from Python) is much smaller than the HTML file downloaded from Chrome.
How can I get exactly the same HTML data as a web browser by using Python?
Do I have to set the request headers to be the same as a web browser's?
Thanks.
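This is most likely not a headers problem: the imgurl values are injected by JavaScript after the initial HTML arrives, and mechanize does not execute JavaScript, so no header will make the two match. A sketch of the selenium alternative (assumes Firefox is installed; the field name 'q' is the same one the mechanize code fills above):
import selenium.webdriver as webdriver
driver = webdriver.Firefox()            # a real browser that runs the page's JS
driver.get('http://www.google.com/imghp?hl=en')
box = driver.find_element_by_name('q')  # same search field as br.form['q'] above
box.send_keys('dog')
box.submit()
with open('1.html', 'wb') as f:
    f.write(driver.page_source.encode('utf-8'))  # the rendered DOM, not the raw response
driver.quit()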

Simulate browser access to load all html elements

I am trying to load a YouTube page and get the <embed> element as follows. However, the embed element cannot be found (soup.find('embed') returns None).
import urllib
import urllib2
from bs4 import BeautifulSoup
import mechanize
YT_URL = 'http://www.youtube.com/watch'
vidId = 'OuSdU8tbcHY'
br = mechanize.Browser()
# Browser options
br.set_handle_equiv(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
# Follow refresh 0 but don't hang on refresh > 0
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
# User-Agent (this is cheating, ok?)
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
br.open('%s?v=%s' % (YT_URL, vidId))
soup = BeautifulSoup(br.response().read())
print soup.find('embed')
However, when I write the soup to an HTML file and load it in a browser, it does load the <embed> element. Presumably this has something to do with the browser being different from mechanize and some kind of document.onload() magic?
How can I simulate the browser loading the page so that I can see the <embed> element?
The page uses JavaScript to load the content dynamically. Mechanize simply cannot handle it. You have two options here:
try to simulate those JS calls manually in the script
switch to in-browser tools like selenium
Here's the same sample using selenium:
import selenium.webdriver as webdriver
url = "http://www.youtube.com/watch?v=OuSdU8tbcHY"
driver = webdriver.Firefox()  # a real browser that executes the page's JavaScript
driver.get(url)
embed = driver.find_elements_by_tag_name('embed')[0]
print embed
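If what you actually need is the player URL rather than the element object itself, a likely next step (assuming the element is present) is to read its src attribute:
src = embed.get_attribute('src')
print src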
Hope that helps.

Filling a form using a python script

I'm trying to write a python script that will fill in a form on a website and send it; after sending, I want to search for a keyword on the resulting web page.
More specifically, the form is: https://booking.elal.co.il/newBooking/changeOrder.jsp?LANG=EN&RESSYSTEMID=1
When I fill in the form manually on the web, after I press the "continue" button I get a kind of "processing page", and afterwards I get the web page on which I want to search for the keyword.
I tried to use the script here: http://stockrt.github.io/p/handling-html-forms-with-python-mechanize-and-BeautifulSoup/ , but for some reason, after submitting the form, print br.response().geturl() gives me the URL of the "processing page", not the URL of the web page I want to search.
My Code:
import mechanize
import cookielib
from BeautifulSoup import BeautifulSoup
import html2text
# Browser
br = mechanize.Browser()
# Cookie Jar
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
# Follow refresh 0 but don't hang on refresh > 0
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
# User-Agent (this is cheating, ok?)
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
# The site we will navigate into, handling it's session
br.open('https://booking.elal.co.il/newBooking/changeOrder.jsp?LANG=EN&RESSYSTEMID=1')
# Select the first (index zero) form
br.select_form(nr=0)
# User credentials
br.form['REC_LOC'] = '...'
br.form['DIRECT_RETRIEVE_LASTNAME'] = '...'
# Login
br.submit()
#Trying to print the webpage
html = br.response().read()
print html2text.html2text(html)
Is it possible to do what I want, and how can I do it?
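A sketch of two things worth trying, with the details flagged as assumptions in the comments. Intermediate "processing" pages usually advance either via a meta refresh, which mechanize can follow if the refresh handler is allowed to wait, or via a JavaScript redirect, which mechanize cannot execute, so the target URL has to be pulled out of the page by hand. This replaces the br.submit() step in the code above:
import re
# Option 1: let the refresh handler actually wait (max_time raised from
# 1 second); this is enough if the page uses <meta http-equiv="refresh">.
br.set_handle_refresh(True, max_time=30, honor_time=True)
br.submit()
html = br.response().read()
# Option 2: if the page redirects via JavaScript, extract the target URL
# manually. The pattern below is a guess -- inspect the real HTML; if the
# captured URL is relative, join it against br.geturl() first.
m = re.search(r'window\.location(?:\.href)?\s*=\s*["\'](.+?)["\']', html)
if m:
    html = br.open(m.group(1)).read()
print 'keyword' in html  # 'keyword' is a placeholder for the real search term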

python mechanize session not saved

I'm trying to use python mechanize to retrieve the list of apps on iTunes Connect. Once this list is retrieved, further work will be done with those links.
Logging in succeeds, but when I then follow the "Manage Your Applications" link I get redirected back to the login page. It is as if the session gets lost.
import mechanize
import cookielib
from BeautifulSoup import BeautifulSoup
import html2text
filename = 'itunes.html'
br = mechanize.Browser()
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
br.set_handle_equiv(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
br.open('https://itunesconnect.apple.com/WebObjects/iTunesConnect.woa')
br.select_form(name='appleConnectForm')
br.form['theAccountName'] = username  # username/password are assumed to be defined earlier
br.form['theAccountPW'] = password
br.submit()
apps_link = br.find_link(text='Manage Your Applications')
print "Manage Your Apps link = ", apps_link
req = br.follow_link(text='Manage Your Applications')
for app_link in br.links():
    print "link is ", app_link
Any ideas what could be wrong?
You need to save/load the cookiejar
Figured this out after further investigation. This was due to a known bug in cookielib, documented here: http://bugs.python.org/issue3924
Basically, some sites (notably itunesconnect) set the cookie version as a string, not an int, which causes an error in cookielib since it does not handle that condition. The fix at the bottom of that issue thread worked for me.
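For reference, a sketch along the lines of that workaround (the exact code in the issue thread may differ; the idea is to strip the quotes from the cookie's version attribute before cookielib trips over it):
import cookielib

class QuoteTolerantCookieJar(cookielib.LWPCookieJar):
    # Some servers send version='"1"' (a quoted string) where cookielib
    # expects an int; strip the quotes before the base class parses it.
    def _cookie_from_cookie_tuple(self, tup, request):
        name, value, standard, rest = tup
        version = standard.get('version', None)
        if version is not None:
            standard['version'] = version.replace('"', '')
        return cookielib.LWPCookieJar._cookie_from_cookie_tuple(
            self, (name, value, standard, rest), request)

cj = QuoteTolerantCookieJar()  # drop-in replacement for LWPCookieJar above
br.set_cookiejar(cj)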
