How to scrape embedded links and tabular information - python

I'm trying to scrape information about the datasets available on this website.
I want to collect the URLs to the resources and at least the title of the dataset.
Using this resource as an example, I want to capture the URL embedded in "Go to resource" and the title listed in the table:
I have created a basic scraper, but it doesn't seem work:
import requests
import csv
from bs4 import BeautifulSoup
site = requests.get('https://data.nsw.gov.au/data/dataset');
data_list=[]
if site.status_code is 200:
content = BeautifulSoup(site.content, 'html.parser')
internals = content.select('.resource-url-analytics')
for url in internals:
title = internals.select=('.resource-url-analytics')[0].get_text()
link = internals.select=('.resource-url-analytics')[0].get('href')
new_data = {"title": title, "link": link}
data_list.append(new_data)
with open ('selector.csv','w') as file:
writer = csv.DictWriter(file, fieldnames = ["dataset", "link"], delimiter = ';')
writer.writeheader()
for row in data_list:
writer.writerow(row)
I would like to write the output to a CSV with columns for the URLs and the titles.
This is an example of the desired output
Greatly appreciative for any assistance

Have a look at the API for the datasets that will likely be the easiest way to do this.
In the meantime, here is how you can get the API links at id level from those pages and store the entire package info for all packages in one list, data_sets, and just the info of interest in another variable (results). Be sure to review the API documentation in case there is a better method - for example, it would be nice if ids could be submitted in batches rather than per id.
Answer below is taking advantage of the endpoint detailed in the documentation which is used to get a full JSON representation of a dataset, resource or other object
Taking the current first result on landing page of:
Vegetation of the Guyra 1:25000 map sheet VIS_ID 240.
We want the last child a of parent h3 with a parent having class .dataset-item. In the below, the spaces between selectors are descendant combinators.
.dataset-item h3 a:last-child
You can shorten this to h3 a:last-child for a small efficiency gain.
This relationship reliably selects all relevant links on page.
Continuing with this example, visiting that retrieved url for first listed item, we can find the id using api endpoint (which retrieves json related to this package), via an attribute=value selector with contains, *, operator. We know this particular api endpoint has a common string so we substring match on the href attribute value:
[href*="/api/3/action/package_show?id="]
The domain can vary and some retrieved links are relative so we have to test if relative and add the appropriate domain.
First page html for that match:
Notes:
data_sets is a list containing all the package data for each package and is extensive. I did this in case you are interest in looking at what is in those packages (besides reviewing the API documentation)
You can get total number of pages from soup object on a page via
num_pages = int(soup.select('[href^="/data/dataset?page="]')[-2].text)
You can alter the loop for less pages.
Session object is used for efficiency of re-using connection. I'm sure there are other improvements to be made. In particular I would look for any method which reduced the number of requests (why I mentioned looking for a batch id endpoint for example).
There can be none to more than one resource url within a returned package. See example here. You can edit code to handle this.
Python:
from bs4 import BeautifulSoup as bs
import requests
import csv
from urllib.parse import urlparse
json_api_links = []
data_sets = []
def get_links(s, url, css_selector):
r = s.get(url)
soup = bs(r.content, 'lxml')
base = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(url))
links = [base + item['href'] if item['href'][0] == '/' else item['href'] for item in soup.select(css_selector)]
return links
results = []
#debug = []
with requests.Session() as s:
for page in range(1,2): #you decide how many pages to loop
links = get_links(s, 'https://data.nsw.gov.au/data/dataset?page={}'.format(page), '.dataset-item h3 a:last-child')
for link in links:
data = get_links(s, link, '[href*="/api/3/action/package_show?id="]')
json_api_links.append(data)
#debug.append((link, data))
resources = list(set([item.replace('opendata','') for sublist in json_api_links for item in sublist])) #can just leave as set
for link in resources:
try:
r = s.get(link).json() #entire package info
data_sets.append(r)
title = r['result']['title'] #certain items
if 'resources' in r['result']:
urls = ' , '.join([item['url'] for item in r['result']['resources']])
else:
urls = 'N/A'
except:
title = 'N/A'
urls = 'N/A'
results.append((title, urls))
with open('data.csv','w', newline='') as f:
w = csv.writer(f)
w.writerow(['Title','Resource Url'])
for row in results:
w.writerow(row)
All pages
(very long running so consider threading/asyncio):
from bs4 import BeautifulSoup as bs
import requests
import csv
from urllib.parse import urlparse
json_api_links = []
data_sets = []
def get_links(s, url, css_selector):
r = s.get(url)
soup = bs(r.content, 'lxml')
base = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(url))
links = [base + item['href'] if item['href'][0] == '/' else item['href'] for item in soup.select(css_selector)]
return links
results = []
#debug = []
with requests.Session() as s:
r = s.get('https://data.nsw.gov.au/data/dataset')
soup = bs(r.content, 'lxml')
num_pages = int(soup.select('[href^="/data/dataset?page="]')[-2].text)
links = [item['href'] for item in soup.select('.dataset-item h3 a:last-child')]
for link in links:
data = get_links(s, link, '[href*="/api/3/action/package_show?id="]')
json_api_links.append(data)
#debug.append((link, data))
if num_pages > 1:
for page in range(1, num_pages + 1): #you decide how many pages to loop
links = get_links(s, 'https://data.nsw.gov.au/data/dataset?page={}'.format(page), '.dataset-item h3 a:last-child')
for link in links:
data = get_links(s, link, '[href*="/api/3/action/package_show?id="]')
json_api_links.append(data)
#debug.append((link, data))
resources = list(set([item.replace('opendata','') for sublist in json_api_links for item in sublist])) #can just leave as set
for link in resources:
try:
r = s.get(link).json() #entire package info
data_sets.append(r)
title = r['result']['title'] #certain items
if 'resources' in r['result']:
urls = ' , '.join([item['url'] for item in r['result']['resources']])
else:
urls = 'N/A'
except:
title = 'N/A'
urls = 'N/A'
results.append((title, urls))
with open('data.csv','w', newline='') as f:
w = csv.writer(f)
w.writerow(['Title','Resource Url'])
for row in results:
w.writerow(row)

For simplicity use selenium package:
from selenium import webdriver
import os
# initialise browser
browser = webdriver.Chrome(os.getcwd() + '/chromedriver')
browser.get('https://data.nsw.gov.au/data/dataset')
# find all elements by xpath
get_elements = browser.find_elements_by_xpath('//*[#id="content"]/div/div/section/div/ul/li/div/h3/a[2]')
# collect data
data = []
for item in get_elements:
data.append((item.text, item.get_attribute('href')))
Output:
('Vegetation of the Guyra 1:25000 map sheet VIS_ID 240', 'https://datasets.seed.nsw.gov.au/dataset/vegetation-of-the-guyra-1-25000-map-sheet-vis_id-2401ee52')
('State Vegetation Type Map: Riverina Region Version v1.2 - VIS_ID 4469', 'https://datasets.seed.nsw.gov.au/dataset/riverina-regional-native-vegetation-map-version-v1-0-vis_id-4449')
('Temperate Highland Peat Swamps on Sandstone (THPSS) spatial distribution maps...', 'https://datasets.seed.nsw.gov.au/dataset/temperate-highland-peat-swamps-on-sandstone-thpss-vegetation-maps-vis-ids-4480-to-4485')
('Environmental Planning Instrument - Flood', 'https://www.planningportal.nsw.gov.au/opendata/dataset/epi-flood')
and so on

Related

how to scrape data on a website with view more with beautifulsoup

i am trying to scrape news from reuters but there is a click to view more at the bottom on the website. I could not know how to load the hidden results by using beautiful soup.
from bs4 import BeautifulSoup
import urllib.request
def scrape_reuters_news(ticker):
url = "https://www.reuters.com/search/news?sortBy=relevance&dateRange=pastWeek&blob="+ticker
scraped_data = urllib.request.urlopen(url)
scraped_data = scraped_data.read()
parsed_articles = BeautifulSoup(scraped_data, 'lxml')
links = parsed_articles.find_all("h3")
articles = []
titles = []
title_class = "Text__text___3eVx1j Text__dark-grey___AS2I_p Text__medium___1ocDap Text__heading_2___sUlNJP Heading__base___1dDlXY Heading__heading_2___3f_bIW ArticleHeader__heading___3ibi0Q"
for link in links:
paragraphs = ""
url = "https://www.reuters.com/"+str(link)[41:63]
scraped_data = urllib.request.urlopen(url)
scraped_data = scraped_data.read()
parsed_article = BeautifulSoup(scraped_data, 'lxml')
article = parsed_article.find_all("p")
title = parsed_article.select("h1", {"class": title_class})
titles.append(title[0].text.strip())
for paragraph in article:
paragraphs += paragraph.text + " "
articles.append(paragraphs)
return titles, articles
# edit
ticker = "apple"
news = scrape_reuters_news(ticker)
When you click the load more a callback is issued that you can find in the network tab. If you grab the number of results from the search page, you can add this into the callback to get all results in one go. I then use regex to extract the id to reconstruct each detail page url and the title (headline)
You would then visit each link to get the paragraph info.
Please note:
There is some de-duplication work to do. There exist different ids which lead to same content. So perhaps exclude based on title?
You may need to consider whether any pre-processing of ticker needs to happen e.g. convert to lowercase, replace spaces with "-". I don't know all your use cases.
from bs4 import BeautifulSoup as bs
import requests, re
ticker = 'apple'
with requests.Session() as s:
r = s.get(f'https://www.reuters.com/search/news?sortBy=relevance&dateRange=pastWeek&blob={ticker}')
soup = bs(r.content, 'lxml')
num_results = soup.select_one('.search-result-count-num').text
r = s.get(f'https://www.reuters.com/assets/searchArticleLoadMoreJson?blob={ticker}&bigOrSmall=big&articleWithBlog=true&sortBy=relevance&dateRange=pastWeek&numResultsToShow={num_results}&pn=&callback=addMoreNewsResults')
p = re.compile(r'id: "(.*?)"')
p2 = re.compile(r'headline: "(.*?)"')
links = [f'https://www.reuters.com/article/id{i}' for i in p.findall(r.text)]
headlines = [bs(i, 'lxml').get_text() for i in p2.findall(r.text)]
print(len(links), len(headlines))
From the detail pages you can get the paragraphs with
paras = ' '.join([i.get_text() for i in soup.select('[data-testid*=paragraph-]')])

Scraping website with BS4 // accessing class

I am tring to extract different information from websites with BeautifulSoup, such as title of the product and the price.
I do that with different urls, looping through the urls with for...in.... Here, I'll just provide a snippet without the loop.
from bs4 import BeautifulSoup
import requests
import csv
url= 'https://www.mediamarkt.ch/fr/product/_lg-oled65gx6la-1991479.html'
html_content = requests.get(url).text
soup = BeautifulSoup(html_content, "lxml")
price = soup.find('meta', property="product:price:amount")
title = soup.find("div", {"class": "flix-model-name"})
title2 = soup.find('div', class_="flix-model-name")
title3 = soup.find("div", attrs={"class": "flix-model-name"})
print(price['content'])
print(title)
print(title2)
print(title3)
So from this URL https://www.mediamarkt.ch/fr/product/_lg-oled65gx6la-1991479.html I wasnt to extract the product number. the only place I find it is in the div class="flix-model-name". However, I am totally unable to reach it. I tried different ways to access it in the title, title2, title3, but I always have the output none.
I am a bit of a beginner, so I guess I am probably missing something basic... If so, please pardon me for that.
Any help is welcome! Many thanks in advance!
just for info, with each url I thought of appending the data and write them on a CSV file like that:
for url in urls:
html_content = requests.get(url).text
soup = BeautifulSoup(html_content, "lxml")
row=[]
try:
# title = YOUR VERY WELCOMED ANSWER
prices = soup.find('meta', property="product:price:amount")
row = (title.text+','+prices['content']+'\n')
data.append(row)
except:
pass
file = open('database.csv','w')
i = 0
while i < (len(data)):
file.write(data[i])
i +=1
file.close()
Many thanks in advance for your help!
David
Try below approach using python - requests simple, straightforward, reliable, fast and less code is required when it comes to requests. I have fetched the API URL from website itself after inspecting the network section of google chrome browser.
What exactly below script is doing:
First it will take the API URL, create the URL based on 2 dynamic parameters(product and category) and then do GET request to get the data.
After getting the data script will parse the JSON data using json.loads library.
Finally, it will iterate all over the list of products one by one and print the details which are divided in 2 categotries 'box1_ProductToProduct' and 'box2_KategorieTopseller' like Brand, Name, Product number and Unit price. Same way you can add more details by looking in to the API call.
import json
import requests
from urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
def scrap_product_details():
PRODUCT = 'MMCH1991479' #Product number
CATEGORY = '680942' #Category number
URL = 'https://www.mediamarkt.ch/rde_server/res/MMCH/recomm/product_detail/sid/WACXyEbIf3khlu6FcHlh1B1?product=' + PRODUCT + '&category=' + CATEGORY # dynamic URL
response = requests.get(URL,verify = False) #GET request to fetch the data
result = json.loads(response.text) # Parse JSON data using json.loads
box1_ProductToProduct = result[0]['box1_ProductToProduct'] # Extracted data from API
box2_KategorieTopseller = result[1]['box2_KategorieTopseller']
for item in box1_ProductToProduct: # loop over extracted data
print('-' * 100)
print('Brand : ',item['brand'])
print('Name : ',item['name'])
print('Net Unit Price : ',item['netUnitPrice'])
print('Product Number : ',item['product_nr'])
print('-' * 100)
for item in box2_KategorieTopseller: # loop over extracted data
print('-' * 100)
print('Brand : ',item['brand'])
print('Name : ',item['name'])
print('Net Unit Price : ',item['netUnitPrice'])
print('Product Number : ',item['product_nr'])
print('-' * 100)
scrap_product_details()

Can't parse different product links from a webpage

I've created a script in Python to fetch different product links from a webpage. Although I know the content of that site are dynamic, I tried conventional way to let you inform that I tried. I looked for APIs in the dev tools but could not find one. Ain't there any way to get those links using requests?
Site Link
I've written so far:
import requests
from bs4 import BeautifulSoup
link = "https://www.amazon.com/stores/node/10699640011"
def fetch_product_links(url):
res = requests.get(url,headers={"User-Agent":"Mozilla/5.0"})
soup = BeautifulSoup(res.text,"lxml")
for item_link in soup.select("[id^='ProductGrid-'] li[class^='style__itemOuter__'] > a"):
print(item_link.get("href"))
if __name__ == '__main__':
fetch_product_links(link)
How can I fetch different product links from that site using requests?
I think you only need the asins which you can collect from another url construct you can see in network tab i.e. you can significantly shorten the final urls. You do however need to make a request to your original url to pick up an identifier to use in second url. Returns 146 links.
import requests, re, json
node = '10699640011'
with requests.Session() as s:
r = s.get(f'https://www.amazon.com/stores/node/{node}')
p = re.compile(r'var slotsStr = "\[(.*?,){3} share\]";')
identifier = p.findall(r.text)[0]
identifier = identifier.strip()[:-1]
r = s.get(f'https://www.amazon.com/stores/slot/{identifier}?node={node}')
p = re.compile(r'var config = (.*?);')
data = json.loads(p.findall(r.text)[0])
asins = data['content']['ASINList']
links = [f'https://www.amazon.com/dp/{asin}' for asin in asins]
print(links)
EDIT:
With two given nodes:
import requests, re, json
from bs4 import BeautifulSoup as bs
nodes = ['3039806011','10699640011']
with requests.Session() as s:
for node in nodes:
r = s.get(f'https://www.amazon.com/stores/node/{node}')
soup = bs(r.content, 'lxml')
identifier = soup.select('.stores-widget-btf:not([id=share],[id*=RECOMMENDATION])')[-1]['id']
r = s.get(f'https://www.amazon.com/stores/slot/{identifier}?node={node}')
p = re.compile(r'var config = (.*?);')
data = json.loads(p.findall(r.text)[0])
asins = data['content']['ASINList']
links = [f'https://www.amazon.com/dp/{asin}' for asin in asins]
print(links)

How can I loop scraping data for multiple pages in a website using python and beautifulsoup4

I am trying to scrape data from the PGA.com website to get a table of all of the golf courses in the United States. In my CSV table I want to include the Name of the golf course ,Address ,Ownership ,Website , Phone number. With this data I would like to geocode it and place into a map and have a local copy on my computer
I utilized Python and Beautiful Soup4 to extract my data. I have reached as far to extract the data and import it into a CSV but I am now having a problem of scraping data from multiple pages on the PGA website. I want to extract ALL THE GOLF COURSES but my script is limited only to one page I want to loop it in away that it will capture all data for golf courses from all pages found in the PGA site. There are about 18000 gold courses and 900 pages to capture data
Attached below is my script. I need help on creating code that will capture ALL data from the PGA website and not just one site but multiple. In this manner it will provide me with all the data of gold courses in the United States.
Here is my script below:
import csv
import requests
from bs4 import BeautifulSoup
url = "http://www.pga.com/golf-courses/search?searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0"
r = requests.get(url)
soup = BeautifulSoup(r.content)
g_data1=soup.find_all("div",{"class":"views-field-nothing-1"})
g_data2=soup.find_all("div",{"class":"views-field-nothing"})
courses_list=[]
for item in g_data2:
try:
name=item.contents[1].find_all("div",{"class":"views-field-title"})[0].text
except:
name=''
try:
address1=item.contents[1].find_all("div",{"class":"views-field-address"})[0].text
except:
address1=''
try:
address2=item.contents[1].find_all("div",{"class":"views-field-city-state-zip"})[0].text
except:
address2=''
try:
website=item.contents[1].find_all("div",{"class":"views-field-website"})[0].text
except:
website=''
try:
Phonenumber=item.contents[1].find_all("div",{"class":"views-field-work-phone"})[0].text
except:
Phonenumber=''
course=[name,address1,address2,website,Phonenumber]
courses_list.append(course)
with open ('filename5.csv','wb') as file:
writer=csv.writer(file)
for row in courses_list:
writer.writerow(row)
#for item in g_data1:
#try:
#print item.contents[1].find_all("div",{"class":"views-field-counter"})[0].text
#except:
#pass
#try:
#print item.contents[1].find_all("div",{"class":"views-field-course-type"})[0].text
#except:
#pass
#for item in g_data2:
#try:
#print item.contents[1].find_all("div",{"class":"views-field-title"})[0].text
#except:
#pass
#try:
#print item.contents[1].find_all("div",{"class":"views-field-address"})[0].text
#except:
#pass
#try:
#print item.contents[1].find_all("div",{"class":"views-field-city-state-zip"})[0].text
#except:
#pass
This script only captures 20 at a time and I want to capture all in one script which account for 18000 golf courses and 900 pages to scrape form.
The PGA website's search have multiple pages, the url follows the pattern:
http://www.pga.com/golf-courses/search?page=1 # Additional info after page parameter here
this means you can read the content of the page, then change the value of page by 1, and read the the next page.... and so on.
import csv
import requests
from bs4 import BeautifulSoup
for i in range(907): # Number of pages plus one
url = "http://www.pga.com/golf-courses/search?page={}&searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0".format(i)
r = requests.get(url)
soup = BeautifulSoup(r.content)
# Your code for each individual page here
if you still read this post , you can try this code too....
from urllib.request import urlopen
from bs4 import BeautifulSoup
file = "Details.csv"
f = open(file, "w")
Headers = "Name,Address,City,Phone,Website\n"
f.write(Headers)
for page in range(1,5):
url = "http://www.pga.com/golf-courses/search?page={}&searchbox=Course%20Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0".format(page)
html = urlopen(url)
soup = BeautifulSoup(html,"html.parser")
Title = soup.find_all("div", {"class":"views-field-nothing"})
for i in Title:
try:
name = i.find("div", {"class":"views-field-title"}).get_text()
address = i.find("div", {"class":"views-field-address"}).get_text()
city = i.find("div", {"class":"views-field-city-state-zip"}).get_text()
phone = i.find("div", {"class":"views-field-work-phone"}).get_text()
website = i.find("div", {"class":"views-field-website"}).get_text()
print(name, address, city, phone, website)
f.write("{}".format(name).replace(",","|")+ ",{}".format(address)+ ",{}".format(city).replace(",", " ")+ ",{}".format(phone) + ",{}".format(website) + "\n")
except: AttributeError
f.close()
where it is written range(1,5) just change that with 0,to the last page , and you will get all details in CSV, i tried very hard to get your data in proper format but it's hard:).
You're putting a link to a single page, it's not going to iterate through each one on its own.
Page 1:
url = "http://www.pga.com/golf-courses/search?searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0"
Page 2:
http://www.pga.com/golf-courses/search?page=1&searchbox=Course%20Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0
Page 907:
http://www.pga.com/golf-courses/search?page=906&searchbox=Course%20Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0
Since you're running for page 1 you'll only get 20. You'll need to create a loop that'll run through each page.
You can start off by creating a function that does one page then iterate that function.
Right after the search? in the url, starting at page 2, page=1 begins increasing until page 907 where it's page=906.
I noticed that the first solution had a repetition of the first instance, that is because the 0 page and 1 page is the same page. This is resolved by specifying the start page in the range function. Example below...
for i in range(1, 907): #Number of pages plus one
url = "http://www.pga.com/golf-courses/search?page={}&searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0".format(i)
r = requests.get(url)
soup = BeautifulSoup(r.content, "html5lib") #Can use whichever parser you prefer
# Your code for each individual page here
Had this same exact problem and the solutions above did not work. I solved mine by accounting for cookies. A requests session helps. Create a session and it'll pull all the pages you need by inserting a cookie to all the numbered pages.
import csv
import requests
from bs4 import BeautifulSoup
url = "http://www.pga.com/golf-courses/search?searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0"
s = requests.Session()
r = s.get(url)
The PGA website has changed this question has been asked.
It seems they organize all courses by: State > City > Course
In light of this change and the popularity of this question, here's how I'd solve this problem today.
Step 1 - Import everything we'll need:
import time
import random
from gazpacho import Soup # https://github.com/maxhumber/gazpacho
from tqdm import tqdm # to keep track of progress
Step 2 - Scrape all the state URL endpoints:
URL = "https://www.pga.com"
def get_state_urls():
soup = Soup.get(URL + "/play")
a_tags = soup.find("ul", {"data-cy": "states"}, mode="first").find("a")
state_urls = [URL + a.attrs['href'] for a in a_tags]
return state_urls
state_urls = get_state_urls()
Step 3 - Write a function to scrape all the city links:
def get_state_cities(state_url):
soup = Soup.get(state_url)
a_tags = soup.find("ul", {"data-cy": "city-list"}).find("a")
state_cities = [URL + a.attrs['href'] for a in a_tags]
return state_cities
state_url = state_urls[0]
city_links = get_state_cities(state_url)
Step 4 - Write a function to scrape all of the courses:
def get_courses(city_link):
soup = Soup.get(city_link)
courses = soup.find("div", {"class": "MuiGrid-root MuiGrid-item MuiGrid-grid-xs-12 MuiGrid-grid-md-6"}, mode="all")
return courses
city_link = city_links[0]
courses = get_courses(city_link)
Step 5 - Write a function to parse all the useful info about a course:
def parse_course(course):
return {
"name": course.find("h5", mode="first").text,
"address": course.find("div", {'class': "jss332"}, mode="first").strip(),
"url": course.find("a", mode="first").attrs["href"]
}
course = courses[0]
parse_course(course)
Step 6 - Loop through everything and save:
all_courses = []
for state_url in tqdm(state_urls):
city_links = get_state_cities(state_url)
time.sleep(random.uniform(1, 10) / 10)
for city_link in city_links:
courses = get_courses(city_link)
time.sleep(random.uniform(1, 10) / 10)
for course in courses:
info = parse_course(course)
all_courses.append(info)

Display all search results when web scraping with Python

I'm trying to scrape a list of URL's from the European Parliament's Legislative Observatory. I do not type in any search keyword in order to get all links to documents (currently 13172). I can easily scrape a list of the first 10 results which are displayed on the website using the code below. However, I want to have all links so that I would not need to somehow press the next page button. Please let me know if you know of a way to achieve this.
import requests, bs4, re
# main url of the Legislative Observatory's search site
url_main = 'http://www.europarl.europa.eu/oeil/search/search.do?searchTab=y'
# function gets a list of links to the procedures
def links_to_procedures (url_main):
# requesting html code from the main search site of the Legislative Observatory
response = requests.get(url_main)
soup = bs4.BeautifulSoup(response.text) # loading text into Beautiful Soup
links = [a.attrs.get('href') for a in soup.select('div.procedure_title a')] # getting a list of links of the procedure title
return links
print(links_to_procedures(url_main))
You can follow the pagination by specifying the page GET parameter.
First, get the results count, then calculate the number of pages to process by dividing the count on the results count per page. Then, iterate over pages one by one and collect the links:
import re
from bs4 import BeautifulSoup
import requests
response = requests.get('http://www.europarl.europa.eu/oeil/search/search.do?searchTab=y')
soup = BeautifulSoup(response.content)
# get the results count
num_results = soup.find('span', class_=re.compile('resultNum')).text
num_results = int(re.search('(\d+)', num_results).group(1))
print "Results found: " + str(num_results)
results_per_page = 50
base_url = "http://www.europarl.europa.eu/oeil/search/result.do?page={page}&rows=%s&sort=d&searchTab=y&sortTab=y&x=1411566719001" % results_per_page
links = []
for page in xrange(1, num_results/results_per_page + 1):
print "Current page: " + str(page)
url = base_url.format(page=page)
response = requests.get(url)
soup = BeautifulSoup(response.content)
links += [a.attrs.get('href') for a in soup.select('div.procedure_title a')]
print links

Categories