Retrieve MechanicalSoup results after submitting a form - python

I am struggling to retrieve some results from a simple form submission. This is what I have so far:
import mechanicalsoup
browser = mechanicalsoup.StatefulBrowser()
browser.set_verbose(2)
url = "https://www.dermcoll.edu.au/find-a-derm/"
browser.open(url)
form = browser.select_form("#find-derm-form")
browser["postcode"] = 3000
browser.submit_selected()
form.print_summary()
Where do these results end up...?
Many thanks

As per the MechanicalSoup FAQ, you shouldn't use this library when dealing with a dynamic JavaScript-enabled form, which seems to be the case for the website in your example.
Instead, you can use Selenium in combination with BeautifulSoup (and a little bit of help from webdriver-manager) to achieve your desired result. A short example would look like this:
from selenium import webdriver
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
# set up the Chrome driver instance using webdriver_manager
driver = webdriver.Chrome(ChromeDriverManager().install())
# navigate to the page
driver.get("https://www.dermcoll.edu.au/find-a-derm/")
# find the postcode input and enter your desired value
postcode_input = driver.find_element_by_name("postcode")
postcode_input.send_keys("3000")
# find the search button and perform the search
search_button = driver.find_element_by_class_name("search-btn.location_derm_search_icon")
search_button.click()
# get all search results and load them into a BeautifulSoup object for parsing
search_results = driver.find_element_by_id("search_result")
search_results = search_results.get_attribute('innerHTML')
search_results = BeautifulSoup(search_results, "html.parser")
# get individual result cards
search_results = search_results.find_all("div", {"class": "address_sec_contents"})
# now you can parse for whatever information you need
[x.find("h4") for x in search_results] # names
[x.find("p", {"class": "qualification"}) for x in search_results] # qualifications
[x.find("address") for x in search_results] # addresses
While this way may seem more involved, it's a lot more robust and can be easily repurposed for many more situations where MechanicalSoup falls short.

Related

How to scrape news articles from cnbc with keyword "Green hydrogen"?

I am trying to scrape the news articles listed at this URL; all the articles are in span.Card-title. But this gives blank output. Is there any way to resolve this?
from bs4 import BeautifulSoup as soup
import requests
cnbc_url = "https://www.cnbc.com/search/?query=green%20hydrogen&qsearchterm=green%20hydrogen"
html = requests.get(cnbc_url)
bsobj = soup(html.content,'html.parser')
day = bsobj.find(id="root")
print(day.find_all('span',class_='Card-title'))
for link in bsobj.find_all('span', class_='Card-title'):
    print('Headlines : {}'.format(link.text))
The problem is that the content is not present on the page when it initially loads; it is only fetched from the server afterwards, using a URL like this
https://api.queryly.com/cnbc/json.aspx?queryly_key=31a35d40a9a64ab3&query=green%20hydrogen&endindex=0&batchsize=10&callback=&showfaceted=false&timezoneoffset=-240&facetedfields=formats&facetedkey=formats%7C&facetedvalue=!Press%20Release%7C&needtoptickers=1&additionalindexes=4cd6f71fbf22424d,937d600b0d0d4e23,3bfbe40caee7443e,626fdfcd96444f28
and then added to the page.
Take a look at the /json.aspx endpoint in devtools; the data seems to be there.
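A minimal sketch of querying that endpoint directly with requests (the 'results' and 'cn:title' keys are taken from the JSON response visible in devtools; verify them before relying on this):
import requests
# trimmed set of query parameters; the full Request URL copied from devtools also works
api_url = ("https://api.queryly.com/cnbc/json.aspx?queryly_key=31a35d40a9a64ab3"
           "&query=green%20hydrogen&endindex=0&batchsize=10")
data = requests.get(api_url).json()
for article in data['results']:
    print(article['cn:title'])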
As mentioned in another answer, the data about the articles is loaded via another link, which you can find in the Network tab in devtools. [In Chrome, you can open devtools with Ctrl+Shift+I, then go to the Network tab to see the requests made, click on the name starting with 'json.aspx?...' to see the details, and copy the Request URL from the Headers section.]
Once you have the Request URL, you can copy it and make the request in your code to get the data:
# dataReqUrl contains the copied Request URL
dataReq = requests.get(dataReqUrl)
for r in dataReq.json()['results']: print(r['cn:title'])
If you don't feel like trying to find that one request in 250+ other requests, you might also try to assemble a shorter form of the url with something like:
# import urllib.parse
# find link to js file with api key
jsLinks = bsobj.select('link[href][rel="preload"]')
jUrl = [m.get('href') for m in jsLinks if 'main' in m.get('href')][0]
jRes = requests.get(jUrl) # request js file api key
# get api key from javascript
qKey = jRes.text.replace(' ', '').split(
    'QUERYLY_KEY:'
)[-1].split(',')[0].replace('"', '').strip()
# form url
qParams = {
    'queryly_key': qKey,
    'query': search_for,  # = 'green hydrogen'
    'batchsize': 10  # can go up to 100 apparently
}
qUrlParams = urllib.parse.urlencode(qParams, quote_via=urllib.parse.quote)
dataReqUrl = f'https://api.queryly.com/cnbc/json.aspx?{qUrlParams}'
Even though the assembled dataReqUrl is not identical to the copied one, it seems to be giving the same results (I checked with a few different search terms). However, I don't know how reliable this method is, especially compared to the much less convoluted approach with selenium:
# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# define chromeDriver_path <-- where you saved 'chromedriver.exe'
cnbc_url = "https://www.cnbc.com/search/?query=green%20hydrogen&qsearchterm=green%20hydrogen"
driver = webdriver.Chrome(chromeDriver_path)
driver.get(cnbc_url)
ctSelector = 'span.Card-title'
WebDriverWait(driver, 5).until(EC.visibility_of_all_elements_located(
(By.CSS_SELECTOR, ctSelector)))
cardTitles = driver.find_elements(By.CSS_SELECTOR, ctSelector)
cardTitles_text = [ct.get_attribute('innerText') for ct in cardTitles]
for c in cardTitles_text: print(c)
In my opinion, this approach is more reliable as well as simpler.

Web scraping when scrolling down is needed

I want to scrape, e.g., the titles of the first 200 questions under the web page https://www.quora.com/topic/Stack-Overflow-4/all_questions. And I tried the following code:
import requests
from bs4 import BeautifulSoup
url = "https://www.quora.com/topic/Stack-Overflow-4/all_questions"
print("url")
print(url)
r = requests.get(url) # HTTP request
print("r")
print(r)
html_doc = r.text # Extracts the html
print("html_doc")
print(html_doc)
soup = BeautifulSoup(html_doc, 'lxml') # Create a BeautifulSoup object
print("soup")
print(soup)
It gave me the text at https://pastebin.com/9dSPzAyX. If we search for href='/, we can see that the HTML does contain the titles of some questions. However, there aren't enough of them; on the actual web page, a user needs to manually scroll down to trigger the loading of more.
Does anyone know how I could mimic "scrolling down" programmatically to load more of the page's content?
Infinite scrolling on a webpage is based on JavaScript functionality. Therefore, to find out which URL we need to access and which parameters to use, we need to either thoroughly study the JS code working inside the page or, preferably, examine the requests the browser makes when you scroll down the page. We can study requests using the Developer Tools.
See, for example, the requests Quora generates: the more you scroll down, the more requests are made. Your requests should then go to that URL instead of the normal one, but keep in mind to send the correct headers and payload.
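A rough sketch of that approach (the URL, headers, and payload below are placeholders; copy the real ones from the Network tab, since Quora's actual endpoint is not shown here):
import requests
# placeholders - replace with the Request URL, headers, and payload copied from devtools
xhr_url = "https://www.quora.com/<copied-request-path>"
headers = {"User-Agent": "Mozilla/5.0", "Content-Type": "application/json"}
payload = {}  # whatever body the browser sent for that request
resp = requests.post(xhr_url, headers=headers, json=payload)
print(resp.status_code)
# inspect resp.json() or resp.text for the question data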
Another, easier solution is to use Selenium.
I couldn't find a way to do this with requests, but you can use Selenium. The code below first prints the number of questions on the initial load, then sends the End key to mimic scrolling down. You can see the number of questions go from 20 to 40 after sending the End key.
I wait 5 seconds before reading the DOM again, in case the script runs faster than the new content loads. You can improve on this by using expected conditions (EC) with Selenium.
The page loads 20 questions per scroll, so if you are looking to scrape 100 questions, you need to send the End key 5 times.
To use the code below you need to install chromedriver.
http://chromedriver.chromium.org/downloads
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

CHROMEDRIVER_PATH = ""  # path to your chromedriver executable
CHROME_PATH = ""  # optional path to the Chrome binary
WINDOW_SIZE = "1920,1080"

chrome_options = Options()
# chrome_options.add_argument("--headless")
chrome_options.add_argument("--window-size=%s" % WINDOW_SIZE)
chrome_options.binary_location = CHROME_PATH
prefs = {'profile.managed_default_content_settings.images': 2}  # skip loading images
chrome_options.add_experimental_option("prefs", prefs)

url = "https://www.quora.com/topic/Stack-Overflow-4/all_questions"

def scrape(url, times):
    if not url.startswith('http'):
        raise Exception('URLs need to start with "http"')
    driver = webdriver.Chrome(
        executable_path=CHROMEDRIVER_PATH,
        chrome_options=chrome_options
    )
    driver.get(url)
    counter = 1
    while counter <= times:
        # count the questions currently in the list
        q_list = driver.find_element_by_class_name('TopicAllQuestionsList')
        questions = [x for x in q_list.find_elements_by_xpath('//div[@class="pagedlist_item"]')]
        q_len = len(questions)
        print(q_len)
        # send the End key to trigger loading of the next batch
        html = driver.find_element_by_tag_name('html')
        html.send_keys(Keys.END)
        wait = WebDriverWait(driver, 5)
        time.sleep(5)
        questions2 = [x for x in q_list.find_elements_by_xpath('//div[@class="pagedlist_item"]')]
        print(len(questions2))
        counter += 1
    driver.close()

if __name__ == '__main__':
    scrape(url, 5)
I recommend using Selenium rather than bs4. Selenium can control the browser as well as parse the page: scrolling down, clicking buttons, etc.
This example scrolls down to collect all the users who liked a post on Instagram:
https://stackoverflow.com/a/54882356/5611675
If the content only loads on "scrolling down", this probably means that the page is using JavaScript to dynamically load the content.
You can try using a web client such as PhantomJS to load the page and execute the JavaScript in it, and simulate the scroll by injecting some JS such as document.body.scrollTop = sY; (Simulate scroll event using Javascript).
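A rough sketch of the same idea using Selenium instead of PhantomJS (which is no longer maintained), injecting the scroll via execute_script and waiting for new content to load:
import time
from selenium import webdriver
driver = webdriver.Chrome()  # assumes chromedriver is available on PATH
driver.get("https://www.quora.com/topic/Stack-Overflow-4/all_questions")
for _ in range(5):  # scroll 5 times; adjust as needed
    # inject JS that scrolls to the bottom of the page
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)  # crude wait for the next batch of questions to load
html_doc = driver.page_source  # pass this to BeautifulSoup as in the question
driver.quit()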

Python Web Scraping with search and non dynamic URI

I'm a beginner in the world of Python and web scrapers. I am used to making scrapers for dynamic URLs, where the URI changes when I input specific parameters into the URL itself.
Ex: Wikipedia.
(If I search for "Stack Overflow", I get a URI that looks like this: https://en.wikipedia.org/wiki/Stack_Overflow)
I have now been challenged to develop a web scraper to collect data from this page.
The field "Texto/Termos a serem pesquisados" corresponds to a search field, but when I enter a search the URL stays the same, which keeps me from getting the right HTML code for my research.
I am used to working with BeautifulSoup and Requests for scraping, but in this case they are of no use, since the URL stays the same after the search.
import requests
from bs4 import BeautifulSoup
url = 'http://comprasnet.gov.br/acesso.asp?url=/ConsultaLicitacoes/ConsLicitacao_texto.asp'
html = requests.get(url)
bs0bj = BeautifulSoup(html.content,'html.parser')
print(bs0bj)
# And from now on i cant go any further
Usually I would do something like:
url = 'https://en.wikipedia.org/wiki/'
input = input('Input your search :)')
search = url + input
And then do all the BeautifulSoup and findAll things to get my data from the HTML code.
I have tried to use Selenium too, but I'm looking for something different from that, to avoid the whole webdriver setup. With the following piece of code I have achieved some odd results, but I still can't scrape the HTML in a good way.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import requests
from bs4 import BeautifulSoup
# Access the page and input the search in the field
driver = webdriver.Chrome()
driver.get('http://comprasnet.gov.br/acesso.asp?url=/ConsultaLicitacoes/ConsLicitacao_texto.asp')
driver.switch_to.frame('main2')
busca = driver.find_element_by_id("txtTermo")
busca.send_keys("GESTAO DE PESSOAS")
#data_inicio = driver.find_element_by_id('dt_publ_ini')
#data_inicio.send_keys("01/01/2018")
#data_fim = driver.find_element_by_id('dt_publ_fim')
#data_fim.send_keys('20/12/2018')
botao = driver.find_element_by_id('ok')
botao.click()
So, given all that:
Is there a way to scrape data from these static URLs?
Can I input a search in the field via code?
Why can't I scrape the right source code?
The problem is that your initial search page is using frames for the searching & results, which makes it harder for BeautifulSoup to work with it. I was able to obtain the search results by using a slightly different URL and MechanicalSoup instead:
>>> from mechanicalsoup import StatefulBrowser
>>> sb = StatefulBrowser()
>>> sb.open('http://comprasnet.gov.br/ConsultaLicitacoes/ConsLicitacao_texto.asp')
<Response [200]>
>>> sb.select_form() # select the search form
<mechanicalsoup.form.Form object at 0x7f2c10b1bc18>
>>> sb['txtTermo'] = 'search text' # input the text to search for
>>> sb.submit_selected() # submit the form
<Response [200]>
>>> page = sb.get_current_page() # get the returned page in BeautifulSoup form
>>> type(page)
<class 'bs4.BeautifulSoup'>
Note that the URL I'm using here is that of the frame that has the search form and not the page you provided that was inlining it. This removes one layer of indirection.
MechanicalSoup is built on top of BeautifulSoup and provides some tools for interacting with websites in a similar way to the old mechanize library.
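From there, page is an ordinary BeautifulSoup object, so the usual parsing applies, e.g. (the selector here is only illustrative and has not been checked against the actual results markup):
>>> for row in page.select('table tr'):
...     print(row.get_text(" ", strip=True))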

Using Python to enter data into form and get data from results page

I am using Python 2.7 on a 32 bit Windows machine.
I am trying to enter species data into http://explorer.natureserve.org and retrieve the results, but am having difficulty understanding how to do it. Needless to say I am relatively new to Python.
I have the following code:
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
driver = webdriver.Firefox()
driver.get("http://explorer.natureserve.org")
assert "NatureServe" in driver.title
SciName = driver.find_element_by_name('searchSciOrCommonName')
SciName.send_keys("Arabis georgiana")
SciName.send_keys(Keys.RETURN)
assert "No results found." not in driver.page_source
The above works, but now I need to select the element Arabis georgiana on the results page, which will take me to another page. How do I get the results page back into Python and redirect to the page that I actually want?
This uses the mechanize library rather than Selenium. After opening the page with a mechanize Browser, you need to set the searchSciOrCommonName field value this way:
import mechanize
br = mechanize.Browser()
br.open("http://explorer.natureserve.org")
br.form = list(br.forms())[0]
br.form['searchSciOrCommonName'] = 'butterfly'
response = br.submit()
Then, you can parse the HTML response via, for example, BeautifulSoup:
from bs4 import BeautifulSoup
soup = BeautifulSoup(response, 'html.parser')
for item in soup.select('table[border="1"] > tr i')[1:]:
    print(item.text.strip())
which would print:
Aglais io
Callophrys mossii hidakupa
Callophrys mossii marinensis
Cercyonis pegala incana
...
Psora nipponica
Flowering Plants
Asclepias tuberosa

Item visible in browser not collected by scraper

I'm trying to collect data from the SumofUs website, specifically the number of signatures on the petition. The datum is presented like this: <div class="percent">256,485 </div> (this is the only item of this class on the page).
So I tried this:
import requests
from bs4 import BeautifulSoup
user_agent = {'User-agent': 'Mozilla/5.0'}
url = 'http://action.sumofus.org/a/nhs-patient-corporations/'
raw = requests.get(url, headers = user_agent)
html = BeautifulSoup(raw.text)
# get the item we're seeking
number = html.find("div", class_="percent")
print number
It seems that the number isn't rendered (I've tried a couple of user agent strings.) What else could be causing this? How can I work around this in future?
In the general case you should use a headless browser. Ghost.py is written in Python, so it's probably a good choice to try first.
In this specific case, a little research reveals that there's a much simpler method. Using the Network tab in Chrome, you can see that the site makes an AJAX call to populate the value, so you can just get it directly:
url = "http://action.sumofus.org/api/ak_action_count_by_action/?action=nhs-patient-corporations&additional="
number = int(requests.get(url).text)
You could use Selenium:
import time
from selenium import webdriver
from bs4 import BeautifulSoup
url = 'http://action.sumofus.org/a/nhs-patient-corporations/'
driver = webdriver.Firefox()
driver.get(url)
driver.set_window_position(0, 0)
driver.set_window_size(100000, 200000)
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(5)  # wait for the page to load
# then load BeautifulSoup with the browser's content
html = BeautifulSoup(driver.page_source, 'html.parser')
...
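From there, the lookup from the question should pick up the rendered value, e.g. (a sketch, not verified against the live page):
number = html.find("div", class_="percent")
if number is not None:
    print(number.get_text(strip=True))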
