Selenium does not load <li> inside <ul> inside <div> - python

I am new to Selenium, Python, and programming in general, but I am trying to write a small web scraper. I have encountered a website with multiple links whose HTML code is not available to me when I parse the page with
soup = bs4.BeautifulSoup(html, "lxml")
The HTML code is:
<div class="content">
<div class="vertical_page_list is-detailed">
<div infinite-nodes="true" up-data="{"next":1,"url":"/de/pressemitteilungen?container_contenxt=lg%2C1.0"}">[event]
<ul class="has-no-bottom-margin list-unstyled infinite-nodes--list">
<li class="vertical-page-list--item is-detailed infite-nodes--list-item" style="display: list-item;">
<li class="...>
...
</ul>
</div>
</div>
</div>
But soup only contains this part, missing the <li> elements:
<div class="content">
<div class="vertical_page_list is-detailed">
<div infinite-nodes="true" up-data="{"next":1,"url":"/de/pressemitteilungen?container_contenxt=lg%2C1.0"}">
<ul class="has-no-bottom-margin list-unstyled infinite-nodes--list">
</ul>
</div>
</div>
</div>
It has something to do with the [event] after the div but I can't figure out what to do. My guess was that it is some lazy-loaded code, but using
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
or directly moving to the element
actions = ActionChains(driver)
actions.move_to_element(driver.find_element_by_xpath("//div['infinite-nodes=']")).perform()
did not yield any results. This is the code I am using:
# Enable headless Firefox for Selenium
options = Options()
#options.headless = True
options.add_argument("--headless")
options.page_load_strategy = 'normal'
driver = webdriver.Firefox(options=options, executable_path=r'C:\bin\geckodriver.exe')
print ("Headless Firefox Initialized")
# Load html source code from webpage
driver = webdriver.PhantomJS(executable_path=r'C:\phantomjs\phantomjs-2.1.1-windows\bin\phantomjs.exe')
driver.get("https://www.volkswagen-newsroom.com/de/pressemitteilungen?container_context=lg%2C1.0")
SCROLL_PAUSE_TIME = 2
# Scroll down to bottom
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# Wait to load page
time.sleep(SCROLL_PAUSE_TIME)
print("Scrolled down to bottom")
# Extract html code
driver.find_element_by_xpath("//div['infinite-nodes=']").click() #just testing
time.sleep(SCROLL_PAUSE_TIME)
html = driver.page_source.encode('utf-8')
soup = bs4.BeautifulSoup(html, "lxml")
Could anyone help me please?
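One thing worth trying before giving up on Selenium is an explicit wait for the list items instead of a fixed sleep. A minimal sketch, assuming the <li> elements are eventually rendered inside the infinite-nodes list (untested against the site):
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 20 seconds for at least one list item to appear in the list
wait = WebDriverWait(driver, 20)
wait.until(EC.presence_of_all_elements_located(
    (By.CSS_SELECTOR, "ul.infinite-nodes--list > li")))
html = driver.page_source.encode('utf-8')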

When you visit the page in a browser and log your network traffic, every time the page loads (or you press the "Mehr Pressemitteilungen anzeigen" button) an XHR (XmlHttpRequest) request is made to some kind of API(?), the response of which is JSON that also contains HTML. It's this HTML that contains the list-item elements you're looking for. You don't need Selenium for this:
def get_article_titles():
    import requests
    from bs4 import BeautifulSoup as Soup

    url = "https://www.volkswagen-newsroom.com/de/pressemitteilungen"
    params = {
        "container_context": "lg,1.0",
        "next": "1"
    }
    headers = {
        "accept": "application/json",
        "accept-encoding": "gzip, deflate",
        "user-agent": "Mozilla/5.0",
        "x-requested-with": "XMLHttpRequest"
    }

    while True:
        response = requests.get(url, params=params, headers=headers)
        response.raise_for_status()
        data = response.json()
        params["next"] = data["next"]

        soup = Soup(data["html"], "html.parser")
        for tag in soup.select("h3.page-preview--title > a"):
            yield tag.get_text().strip()


def main():
    from itertools import islice

    for num, title in enumerate(islice(get_article_titles(), 10), start=1):
        print("{}.) {}".format(num, title))

    return 0


if __name__ == "__main__":
    import sys
    sys.exit(main())
Output:
1.) Volkswagen Konzern, BASF, Daimler AG und Fairphone starten Partnerschaft für nachhaltigen Lithiumabbau in Chile
2.) Verkehrsausschuss-Vorsitzender Cem Özdemir informiert sich über Transformation im Elektro-Werk in Zwickau
3.) Astypalea: Start der Transformation zur smarten, nachhaltigen Insel
4.) Vor 60 Jahren: Fußball-Legende Pelé zu Besuch im Volkswagen Werk Wolfsburg
5.) Novum unter den Kompakten: Neuer Polo ist mit „IQ.DRIVE Travel Assist“ teilautomatisiert unterwegs
6.) Der neue Tiguan Allspace – ab sofort bestellbar
7.) Volkswagen startet Vertriebsoffensive im deutschen Markt
8.) Vor 70 Jahren: Volkswagen erhält ersten Beirat
9.) „Experience our Volkswagen Way to Zero“ – neue Ausstellung im DRIVE. Volkswagen Group Forum für Gäste geöffnet
10.) Jetzt bestellbar: Der neue ID.4 GTX
>>>
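Note that get_article_titles() keeps requesting pages forever; the while True loop assumes every JSON response contains "next" and "html" keys. If you want the generator to stop cleanly once the API stops paginating (an assumption about the endpoint, not verified), a slightly more defensive loop inside get_article_titles() might look like:
    while True:
        response = requests.get(url, params=params, headers=headers)
        response.raise_for_status()
        data = response.json()

        soup = Soup(data.get("html", ""), "html.parser")
        for tag in soup.select("h3.page-preview--title > a"):
            yield tag.get_text().strip()

        # stop when the API no longer advertises a next page (assumed behaviour)
        if not data.get("next"):
            break
        params["next"] = data["next"]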

Related

Insert a code, click on a button and extract the result with Scrapy

I should state up front that I have never used Scrapy (so I do not even know whether it is the right tool).
On the website https://www.ufficiocamerale.it/ I would like to enter an 11-digit numeric code (for example 06655971007) in the "INSERISCI LA PARTITA IVA/RAGIONE SOCIALE" search bar and then click "CERCA". Then I would like to save the resulting HTML in a variable that I will later analyse with BeautifulSoup (I shouldn't have any problems with that part).
So, how can I do the first part?
I imagine something like:
import scrapy

class Extraction(scrapy.Spider):

    def start_requests(self):
        url = "https://www.ufficiocamerale.it/"
        # To enter data
        yield scrapy.FormRequest(url=url, formdata={...}, callback=self.parse)
        # To click the button
        # some code

    def parse(self, response):
        print(response.body)
These are the HTML of the search bar and the button:
<input type="search" name="search_input" class="autocomplete form-control" onchange="if (!window.__cfRLUnblockHandlers) return false; checkPartitaIva()" onkeyup="if (!window.__cfRLUnblockHandlers) return false; checkPartitaIva()" id="search_input" placeholder=" " value="">
<button onclick="if (!window.__cfRLUnblockHandlers) return false; dataLayer.push({'event': 'trova azienda'});" type="submit" class="btn btn-primary btn-sm text-uppercase">Cerca</button>
The page uses JavaScript to generate some elements, so it is simpler to use Selenium:
from selenium import webdriver
import time

url = 'https://www.ufficiocamerale.it/'

driver = webdriver.Firefox()
driver.get(url)
time.sleep(5)  # JavaScript needs time to load code

item = driver.find_element_by_xpath('//form[@id="formRicercaAzienda"]//input[@id="search_input"]')
#item = driver.find_element_by_id('search_input')
item.send_keys('06655971007')
time.sleep(1)

button = driver.find_element_by_xpath('//form[@id="formRicercaAzienda"]//p//button[@type="submit"]')
button.click()
time.sleep(5)  # JavaScript needs time to load code

item = driver.find_element_by_tag_name('h1')
print(item.text)
print('---')

all_items = driver.find_elements_by_xpath('//ul[@id="first-group"]/li')
for item in all_items:
    if '@' in item.text:
        print(item.text, '<<< found email:', item.text.split(' ')[1])
    else:
        print(item.text)
print('---')
Result:
DATI DELLA SOCIETÀ - ENEL ENERGIA S.P.A.
---
Partita IVA: 06655971007 - Codice Fiscale: 06655971007
Rag. Sociale: ENEL ENERGIA S.P.A.
Indirizzo: VIALE REGINA MARGHERITA 125 - 00198 - ROMA
Rea: 1150724
PEC: enelenergia@pec.enel.it <<< found email: enelenergia@pec.enel.it
Fatturato: € 13.032.695.000,00 (2020)
ACQUISTA BILANCIO
Dipendenti : 1666 (2021)
---
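The fixed time.sleep() calls work, but they either waste time or fail on a slow connection. A sketch of the same flow with explicit waits (same locators as above, not re-tested against the site):
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 15)

# wait for the search input instead of sleeping a fixed 5 seconds
item = wait.until(EC.presence_of_element_located(
    (By.XPATH, '//form[@id="formRicercaAzienda"]//input[@id="search_input"]')))
item.send_keys('06655971007')

wait.until(EC.element_to_be_clickable(
    (By.XPATH, '//form[@id="formRicercaAzienda"]//p//button[@type="submit"]'))).click()

heading = wait.until(EC.presence_of_element_located((By.TAG_NAME, 'h1')))
print(heading.text)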

Scraping nested html with Selenium

I'm looking for some help with scraping with Selenium in Python.
You need a paid account to view this page, so creating a reproducible example won't be possible.
[Screenshot: the page I'm trying to scrape]
I'm attempting to scrape the data from the pitch in the top right corner of the image, under 'Spots on Field'.
<div class="player-details-football-map__UEFA player-details-football-map">
<div class="shots">
<div>
<a class="shot episode" style="left: 39.8529%; top: 28.9474%;"></a>
<div class="tooltip" style="left: 39.8529%; top: 28.9474%;">
<div class="tooltip-title">
<div class="tooltip-shoot-type">Shot on target</div>
<div class="tooltip-blow-type">Donyell Malen </div>
<div class="tooltip-shoot-name"></div>
</div>
<div class="tooltip-time">h Viktoria Koln</div>
<div class="tooltip-time">Half 1, 18:22 02/09/20</div>
<div class="tooltip-time">Length: 7.1 m</div>
<div class="tooltip-shoot-xg">Expected goals: 0.17</div>
</div>
</div>
The above is a snippet of just one of the data points I want to scrape.
I've tried using BeautifulSoup
from bs4 import BeautifulSoup
from requests import get
url = 'https://football.instatscout.com/players/294322/shots'
response = get(url)
html_soup = BeautifulSoup(response.text, 'html.parser')
type(html_soup)
shots = html_soup.find_all('div', class_ = 'tooltip')
print(type(shots))
print(len(shots))
and nothing was being returned.
So now I've tried using Selenium.
options = Options()
options.add_argument("start-maximized")
driver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\Users\James\OneDrive\Desktop\webdriver\chromedriver.exe')
driver.get('https://football.instatscout.com/players/294322/shots')
print("Page Title is : %s" %driver.title)
driver.find_element_by_name('email').send_keys('my username')
driver.find_element_by_name('pass').send_keys('my password')
driver.find_element_by_xpath('//*[contains(concat( " ", #class, " " ), concat( " ", "hRAqIl", " " ))]').click()
goals = driver.find_element_by_class_name('tooltip')
but I'm getting the error of
NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":".tooltip"}
Can someone please help point me in the right direction? I'm basically trying to scrape everything from the above HTML that includes 'tooltip' in the class name.
Thanks
Using css selectors with bs4:
from bs4 import BeautifulSoup as soup
import re  # for extracting offsets

r = [{**dict(zip(['left', 'top'], re.findall('[\d\.]+', i.div['style']))),
      'shoot_type': i.select_one('.tooltip-shoot-type').text,
      'name': i.select_one('.tooltip-blow-type').text,
      'team': i.select_one('div:nth-of-type(2).tooltip-time').text,
      'time': i.select_one('div:nth-of-type(3).tooltip-time').text,
      'length': i.select_one('div:nth-of-type(4).tooltip-time').text[8:],
      'expected_goals': i.select_one('.tooltip-shoot-xg').text[16:]}
     for i in soup(html, 'html.parser').select('div.shots > div')]
Output:
[{'left': '39.8529', 'top': '28.9474', 'shoot_type': 'Shot on target', 'name': 'Donyell Malen ', 'team': 'h Viktoria Koln', 'time': 'Half 1, 18:22 02/09/20', 'length': '7.1 m', 'expected_goals': '0.17'}]
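The comprehension assumes html already holds the rendered page source. Since the page sits behind a login, one way to obtain it is to reuse the Selenium session from the question and wait for the shot map to render before grabbing page_source (a sketch; the selectors are taken from the question's HTML and are unverified):
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# after logging in as in the question
driver.get('https://football.instatscout.com/players/294322/shots')
WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, 'div.shots div.tooltip')))
html = driver.page_source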

BeautifulSoup can't find list elements given class

I am trying to access the elements in the Ingredients list of the following website: https://www.jamieoliver.com/recipes/pasta-recipes/gennaro-s-classic-spaghetti-carbonara/
<div class="col-md-12 ingredient-wrapper">
<ul class="ingred-list ">
<li>
3 large free-range egg yolks
</li>
<li>
40 g Parmesan cheese, plus extra to serve
</li>
<li>
1 x 150 g piece of higher-welfare pancetta
</li>
<li>
200g dried spaghetti
</li>
<li>
1 clove of garlic
</li>
<li>
extra virgin olive oil
</li>
</ul>
</div>
I first tried just using requests and beautiful soup but my code didn't find the list elements. I then tried using Selenium and it still didn't work. My code is below:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.jamieoliver.com/recipes/pasta-recipes/cracker-ravioli/"

driver = webdriver.Chrome()
driver.get(url)
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

for ultag in soup.findAll('div', {'class': "col-md-12 ingredient-wrapper"}):
    # for ultag in soup.findAll('ul', {'class': 'ingred_list '}):
    for litag in ultag.findALL('li'):
        print(litag.text)
To get the ingredients list, you can use this example:
import requests
from bs4 import BeautifulSoup

url = 'https://www.jamieoliver.com/recipes/pasta-recipes/gennaro-s-classic-spaghetti-carbonara/'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0'}

soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')

for li in soup.select('.ingred-list li'):
    print(' '.join(li.text.split()))
Prints:
3 large free-range egg yolks
40 g Parmesan cheese , plus extra to serve
1 x 150 g piece of higher-welfare pancetta
200 g dried spaghetti
1 clove of garlic
extra virgin olive oil
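If you would rather keep the Selenium page source from the question, the original loop also works once findALL is corrected to find_all (a sketch using the question's own selectors):
for ultag in soup.findAll('div', {'class': "col-md-12 ingredient-wrapper"}):
    for litag in ultag.find_all('li'):
        print(' '.join(litag.text.split()))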

Navigating the DOM tree with BeautifulSoup

I'm scraping a website for the prices of listings, and I can't figure out how to navigate the tree structure.
Ideally I would have a for loop iterate over all the <li> elements and do some data analysis on each, so I would love an iterator over the specific elements that are nested way down.
I tried to reach nested elements à la .div.div. I think I'm just new to this; a few lines of help would be greatly appreciated!
from urllib.request import urlopen as uReq  # assumed import, implied by the uReq alias
from bs4 import BeautifulSoup as soup       # assumed import, implied by the soup alias

myurl = 'https://www.2ememain.be/l/velos-velomoteurs/q/velo/'
uClient = uReq(myurl)
page_html = uClient.read()
uClient.close()

page_soup = soup(page_html, "lxml")
containers = page_soup.findAll(
    "li", {"class": "mp-Listing mp-Listing--list-item"})
Here is the tree structure:
<li class="mp-Listing mp-Listing--list-item">
<figure class="mp-Listing-image-container"><a
data-tracking="mucLxVHX8FbvYBHPHfGkOCRq9VFszDlhSxgIClJUJRXbTYMnnOw8kI1NFuitzMperXfQZoyyS2Mx8VbGSZB7_jITV8iJZErGmgWsWp4Arvmpog9Hw3EO8q45U-6chavRHHXbOGPOeNci_683vlir1_SAK-XDa7Znjl22XHOxxH_n3QwloxZSRCxAKGjVYg8aQGTfUgZd2b9DDBdUR2fqyUEUXqnMGZ5hjKlTKTR67obF26tTc8kc1HAsv_fvTEfJW-UxpJCuVhXjKi3pcuL99F8QesdivVy1p_jhs7KL-528jJXZ-LGNSz6cloZlO3yEsAdN_NxI4vz76mTfPY-fiRuAlSPfcjP8KYuDw9e8Qz-QyhUNfhIzOZyU6r1suEfcihY9w_HYY-Qn6vmZ8Bw9ZZn4CEV7odI4_7RzYe8OBw4UmTXAODFxJgS-7fnlWgUAZqX8wu_WydbQLqDqpMXEMsbzKFxaerTLhhUGBqNlBEzpJ0jBIm7-hafuMH5v3IRU0Iha8fUbu7soVLYTuTcbBG2dUgEH-O2-bALjnkMB8XWlICCM14klxeRyOAFscVKg2m6p5aanRR38dgEXuvVE9UcSjHW43JeNSv3gJ7GwJww"
href="/a/velos-velomoteurs/velos-ancetres-oldtimers/a34926285-peugeot-velo-de-course-1970.html?c=17f70af2bde4a155c6d568ce3cad9ab7&previousPage=lr">
<div class="mp-Listing-image-item mp-Listing-image-item--main"
style="background-image:url(//i.ebayimg.com/00/s/NTI1WDcwMA==/z/LlYAAOSw3Rdc-miZ/$_82.JPG)"><img
alt="Peugeot - V�lo de course - 1970" data-img-src="Peugeot - V�lo de course - 1970"
src="//i.ebayimg.com/00/s/NTI1WDcwMA==/z/LlYAAOSw3Rdc-miZ/$_82.JPG"
title="Peugeot - V�lo de course - 1970" /></div>
</a></figure>
<div class="mp-Listing-content">
<div class="mp-Listing-group mp-Listing-group--main">
<h3 class="mp-Listing-title"><a
data-tracking="mucLxVHX8FbvYBHPHfGkOCRq9VFszDlhSxgIClJUJRXbTYMnnOw8kI1NFuitzMperXfQZoyyS2Mx8VbGSZB7_jITV8iJZErGmgWsWp4Arvmpog9Hw3EO8q45U-6chavRHHXbOGPOeNci_683vlir1_SAK-XDa7Znjl22XHOxxH_n3QwloxZSRCxAKGjVYg8aQGTfUgZd2b9DDBdUR2fqyUEUXqnMGZ5hjKlTKTR67obF26tTc8kc1HAsv_fvTEfJW-UxpJCuVhXjKi3pcuL99F8QesdivVy1p_jhs7KL-528jJXZ-LGNSz6cloZlO3yEsAdN_NxI4vz76mTfPY-fiRuAlSPfcjP8KYuDw9e8Qz-QyhUNfhIzOZyU6r1suEfcihY9w_HYY-Qn6vmZ8Bw9ZZn4CEV7odI4_7RzYe8OBw4UmTXAODFxJgS-7fnlWgUAZqX8wu_WydbQLqDqpMXEMsbzKFxaerTLhhUGBqNlBEzpJ0jBIm7-hafuMH5v3IRU0Iha8fUbu7soVLYTuTcbBG2dUgEH-O2-bALjnkMB8XWlICCM14klxeRyOAFscVKg2m6p5aanRR38dgEXuvVE9UcSjHW43JeNSv3gJ7GwJww"
href="/a/velos-velomoteurs/velos-ancetres-oldtimers/a34926285-peugeot-velo-de-course-1970.html?c=17f70af2bde4a155c6d568ce3cad9ab7&previousPage=lr">Peugeot
- Vélo de course - 1970</a></h3>
<p class="mp-Listing-description mp-text-paragraph">Cet objet est vendu par Catawiki. Cliquez sur le lien
pour être redirigé vers le site Catawiki et placer votre enchère.vélo de cou<span><input
class="mp-Listing-show-more" id="a34926285" type="checkbox" /><span
class="mp-Listing-description mp-Listing-description--extended">rse peugeot des ann�es 70,
�quip� de pneus neufs (michelin dynamic sport), freins Mafac racer, d�railleur allvit, 3
plateaux, 21 vitesses.selle Basano</span><label for="a34926285">...<span
class="mp-Icon mp-Icon--xs mp-svg-arrow-down"></span><span
class="mp-Icon mp-Icon--xs mp-svg-arrow-up"></span></label></span></p>
<div class="mp-Listing-attributes"></div>
</div>
<div class="mp-Listing-group mp-Listing-group--aside">
<div class="mp-Listing-group mp-Listing-group--top-block"><span
class="mp-Listing-price mp-text-price-label">Voir description</span><span
class="mp-Listing-seller-name"><a class="mp-TextLink"
href="/u/catawiki/38096837/">Catawiki</a></span><span
class="mp-Listing-date">Aujourd'hui</span><span class="mp-Listing-location">Toute la
Belgique<br /></span></div>
<div class="mp-Listing-group mp-Listing-group--bottom-block"><span class="mp-Listing-priority">Annonce au
top</span><span class="mp-Listing-seller-link"><a class="mp-TextLink undefined"
href="https://admarkt.2dehands.be/buyside/url/RK-f5Gyr8TS9VKWPn06TDHk8zCWeSU5-PsQDuvr5tYpoRXQYzjmhI4E8OX9dXcZb0TEQOFSDMueu3s5kqHSihdgWdlYIhSdweDBq0ckhYm7kU8NzKSx7FWvKA8-ZSJUz6PW439SHCTDUa2er4_kqge-fyr8zJemRXzISpFdvVIzVufagipJY-9jozmgnesM_bfBJxR6r0IvKWR8GYnfgv0bPsg1Ny5CQMsw4LsI33lUP_g6cYuGIcGOeEupRpJtf1sXv11G7BTj3gZAo5fvVk35hdfr5LVSJxJYsDUOxS7pdcFtkVO-0EEbZwLG3FlDYaPqLnComuKbmrSwzIW6EwfWXvr1lvifS5cOPflPSsVE319HKQ06w2vk4-4N9-E-cSXye9Yj_YHhNCJdEynvHV0XWkMkdLE_flG421UIIHVbDZdKHV429Ka7HQQSdpbyU6nQ94UsVzRfi2gEgXM18WuI96qkT8oFtqZwGrrE4wlyLuDJnPWkzaYmEwsSoPslrkv_mY66yEOLYsLolpTF3aTRU3sqv0GvZwnPkR04uZJY8GeL70uz3XaP5mYPxKz-pmCFbnJN_i9oiA_LjEIrEzSmvCEM_jViUfPB4FIib7VEi_gag5qWNYYxfkIyT4mC9Y0EKx0JbNHzyBs1062ETCiFvtPaAgconmyqW2ztnw4it_D10qAEemDppNOXKMmX_Jg-feuFKwq-MdIxiyJK3yoiKPXzMEEBa2WXqchDAPF52YmcVjq8HDORqYFkq5-iLumz6Y8ut-smKs_-vMG7k52nO3RW3RzuO0syMLBlZGiqUnADJtj0hmGmzqHXRqflq4QCTEE2vmG2flfMSIz9XJ7ECg73CP5OSNPg5VlzWfCVgd7o1TYd-rFBFXWM5Xz-ZlCA03LOZtP3BeQR3-TnSL6MNWo46vEtHq5ntcF-TrFTl4h01C5DNF_7R4W36CqQ4"
rel="noopener noreferrer nofollow" target="_blank">Visiter le site internet</a></span></div>
</div>
</div>
</li>
The idea is to fetch
<span class="mp-Listing-seller-name"><a class="mp-TextLink">
through referencing. Like containers.div.span....
I believe this is what you're looking for:
from bs4 import BeautifulSoup as bs

target = [your code above - note that it's missing the opening <li>]

page_soup = bs(target, "lxml")
containers = page_soup.find_all('li')

for container in containers:
    item = container.find_all("span", class_="mp-Listing-seller-name")
    print(item)
Output:
[<span class="mp-Listing-seller-name"><a class="mp-TextLink" href="/u/catawiki/38096837/">Catawiki</a></span>]
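Since the original goal was the prices of the listings, the same pattern extends to the other spans visible in the snippet; the class names below are taken from the question's HTML (a sketch, not run against the live site):
for container in containers:
    title = container.find("h3", class_="mp-Listing-title")
    price = container.find("span", class_="mp-Listing-price")
    seller = container.find("span", class_="mp-Listing-seller-name")
    # print whatever was found; any of these may be missing on a given listing
    print(title.get_text(strip=True) if title else None,
          price.get_text(strip=True) if price else None,
          seller.get_text(strip=True) if seller else None)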

scraping a div with confirmation popup

I am trying to scrape a file on this site:
https://data.gov.in/catalog/complete-towns-directory-indiastatedistrictsub-district-level-census-2011
I am looking to download the Excel sheet with the complete directory of towns of TRIPURA (the first one in the grid list).
My code is:
import requests
import selenium
from bs4 import BeautifulSoup

URL = 'https://data.gov.in/catalog/complete-towns-directory-indiastatedistrictsub-district-level-census-2011'

with requests.Session() as session:
    session.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36'}
    response = session.get(URL)
    soup = BeautifulSoup(response.content, 'html.parser')

soup
The corresponding element for the file is given below. How can I actually download that particular Excel file? Clicking it directs to another window where a purpose and an email address have to be given. It would be great if you could provide a solution for this.
<div class="view-content">
<div class="views-row views-row-1 views-row-odd views-row-first ogpl-grid-list">
<div class="views-field views-field-title"> <span class="field-content"><span class="title-content">Complete Town Directory by India/State/District/Sub-District Level, Census 2011 - TRIPURA</span></span> </div>
<div class="views-field views-field-field-short-name confirmation-popup-177303 download-confirmation-box file-container excel"> <div class="field-content"><a class="177303 data-extension excel" href="https://data.gov.in/resources/complete-town-directory-indiastatedistrictsub-district-level-census-2011-tripura" target="_blank" title="excel (Open in new window)">excel</a></div> </div>
<div class="views-field views-field-dms-allowed-operations-3 visual-access"> <span class="field-content">Visual Access: NA</span> </div>
<div class="views-field views-field-field-granularity"> <span class="views-label views-label-field-granularity">Granularity: </span> <div class="field-content">Decadal</div> </div>
<div class="views-field views-field-nothing-1 download-file"> <span class="field-content"><span class="download-filesize">File Size: 44.5 KB</span></span> </div>
<div class="views-field views-field-field-file-download-count"> <span class="field-content download-counts"> Download: 529</span> </div>
<div class="views-field views-field-field-reference-url"> <span class="views-label views-label-field-reference-url">Reference URL: </span> <div class="field-content">http://www.censusindia.gov.in/2011census...</div> </div>
<div class="views-field views-field-dms-allowed-operations-1 vote_request_data_api"> <span class="field-content"><a class="api-link" href="https://data.gov.in/resources/complete-town-directory-indiastatedistrictsub-district-level-census-2011-tripura/api" title="View API">Data API</a></span> </div>
<div class="views-field views-field-field-note"> <span class="views-label views-label-field-note">Note: </span> <div class="field-content ogpl-more">NA</div> </div>
<div class="views-field views-field-dms-allowed-operations confirmationpopup-177303 data-export-cont"> <span class="views-label views-label-dms-allowed-operations">EXPORT IN: </span> <span class="field-content"><ul></ul></span> </div> </div>
When you click on the excel link it opens the following page:
https://data.gov.in/node/ID/download
It seems that the ID is the name of the first class of the link, e.g. t.find('a')['class'][0]. Maybe there is a more concise way to get the ID, but this works using the class name.
Then the page https://data.gov.in/node/ID/download redirects to the final URL (of the file).
The following gathers all the URLs in a list:
import requests
from bs4 import BeautifulSoup

URL = 'https://data.gov.in/catalog/complete-towns-directory-indiastatedistrictsub-district-level-census-2011'
src = requests.get(URL)
soup = BeautifulSoup(src.content, 'html.parser')

node_list = [
    t.find('a')['class'][0]
    for t in soup.findAll("div", {"class": "excel"})
]

url_list = []
for url in node_list:
    node = requests.get("https://data.gov.in/node/{0}/download".format(url))
    soup = BeautifulSoup(node.content, 'html.parser')
    content = soup.find_all("meta")[1]["content"].split("=")[1]
    url_list.append(content)

print(url_list)
Complete code that downloads the files using the default filename (based on this post):
import requests
from bs4 import BeautifulSoup
import urllib2
import shutil
import urlparse
import os

def download(url, fileName=None):
    def getFileName(url, openUrl):
        if 'Content-Disposition' in openUrl.info():
            # If the response has Content-Disposition, try to get filename from it
            cd = dict(map(
                lambda x: x.strip().split('=') if '=' in x else (x.strip(), ''),
                openUrl.info()['Content-Disposition'].split(';')))
            if 'filename' in cd:
                filename = cd['filename'].strip("\"'")
                if filename:
                    return filename
        # if no filename was found above, parse it out of the final URL.
        return os.path.basename(urlparse.urlsplit(openUrl.url)[2])

    r = urllib2.urlopen(urllib2.Request(url))
    try:
        fileName = fileName or getFileName(url, r)
        with open(fileName, 'wb') as f:
            shutil.copyfileobj(r, f)
    finally:
        r.close()

URL = 'https://data.gov.in/catalog/complete-towns-directory-indiastatedistrictsub-district-level-census-2011'
src = requests.get(URL)
soup = BeautifulSoup(src.content, 'html.parser')

node_list = [
    t.find('a')['class'][0]
    for t in soup.findAll("div", {"class": "excel"})
]

url_list = []
for url in node_list:
    node = requests.get("https://data.gov.in/node/{0}/download".format(url))
    soup = BeautifulSoup(node.content, 'html.parser')
    content = soup.find_all("meta")[1]["content"].split("=")[1]
    url_list.append(content)
    print("download : " + content)
    download(content)
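The download() helper above is Python 2 (urllib2, urlparse). A roughly equivalent sketch for Python 3, keeping the same filename fallback logic (untested):
import os
import shutil
import urllib.request
import urllib.parse

def download(url, fileName=None):
    def getFileName(url, openUrl):
        cd = openUrl.info().get('Content-Disposition')
        if cd:
            # try to pull the filename out of the Content-Disposition header
            parts = dict(p.strip().split('=', 1) if '=' in p else (p.strip(), '')
                         for p in cd.split(';'))
            filename = parts.get('filename', '').strip("\"'")
            if filename:
                return filename
        # otherwise fall back to the last path segment of the final URL
        return os.path.basename(urllib.parse.urlsplit(openUrl.url)[2])

    with urllib.request.urlopen(url) as r:
        fileName = fileName or getFileName(url, r)
        with open(fileName, 'wb') as f:
            shutil.copyfileobj(r, f)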
