I can't figure out how to get this script to read the actual documents behind the links it pulls, i.e. to bring back the text of the document each link points to. I also tried using iframe and src but was unsuccessful.
I have never tried anything like this before, so I'm a little stumped on what else I can do.
from bs4 import BeautifulSoup
from selenium import webdriver
from urllib.parse import urlparse, parse_qs
import io
import requests
from PyPDF2 import PdfReader
# specify the web driver path
driver = webdriver.Chrome("path/to/chromedriver")
# navigate to the website
url = "https://probaterecords.shelbyal.com/shelby/search.do?indexName=opr&templateName=Main&searchQuery=richard+wygle&lq=&searchType=1®ex=%5B%5Ea-zA-Z0-9_%5D®exwithspaces=%5B%5Ea-zA-Z0-9%5Cs_%5D®exwithasterisks=%5B%5E*a-zA-Z0-9%5Cs_%5D&sortBy=InstrumentFilter&desc=N&searchable=DisplayName%2CLastName%2CFirstName%2CInstrument%2CRecDate%2CPartyRole%2CDocTypeDesc%2CDocType%2CBook%2CPage%2CLot%2CBlock%2CTownship%2COther%2CFreeform%2COtherName&isPhoneticSearch=&q=richard+wygle&basicSortOrder=InstrumentFilter%7CN&Instrument=&Instrument_select=AND&RecDate=&RecDate=&RecDate_select=AND&LastName=&LastName_select=AND&searchkindLast=StartsLast&FirstName=&FirstName_select=OR&FirstName2=&FirstName2_select=AND&DocTypeDesc=&DocTypeDesc_select=AND&Book=&Book_select=AND&Page=&Page_select=AND&MAPBOOK=&MAPBOOK_select=AND&MAPPAGE=&MAPPAGE_select=AND&Lot%23=&Lot%23_select=AND&Lot=&Lot_select=AND&Block=&Block_select=AND&Section=&Section_select=AND&Township=&Township_select=AND&Range=&Range_select=AND&QT=&QT_select=AND&BQT=&BQT_select=AND&LegacyNum=&LegacyNum_select=AND&advancedSortOrder=InstrumentFilter%7CN"
driver.get(url)
# get the page source
html = driver.page_source
# parse the HTML
soup = BeautifulSoup(html, 'html.parser')
# find all the anchor tags with class "nocolor pphoto"
links = soup.select('a[class="nocolor pphoto"]')
# Create an empty dictionary to store the links
unique_links = {}
for link in links:
    href = link['href']
    if href.startswith('/shelby/search.do?indexName=shelbyimages&lq='):
        # construct the full link
        full_link = 'https://probaterecords.shelbyal.com' + href
        # parse the query parameters from the link
        parsed_url = urlparse(full_link)
        query_params = parse_qs(parsed_url.query)
        # extract the instrument number from the query parameters
        instrument_number = query_params['lq'][0]
        # Extract the document type
        options = soup.select('select[name="DocTypeDesc"] option')
        for option in options:
            # check if the option text contains "deeds"
            if "deeds" in option.get_text().lower():
                doc_type = option.get_text()
        # add the link to the dictionary
        unique_links[instrument_number] = (full_link, doc_type)
# Iterate over the unique links
for instrument_number, link_info in unique_links.items():
    full_link, doc_type = link_info
    # Open the PDF file from the url
    response = requests.get(full_link)
    pdf_file = io.BytesIO(response.content)
    pdf_reader = PdfReader(pdf_file)
    # Get the number of pages
    pages = len(pdf_reader.pages)
    # Initialize a variable to store the text
    text = ""
    # Iterate over the pages
    for page in pdf_reader.pages:
        # Extract the text from the page
        text += page.extract_text()
    # Print the document type, instrument number and the text
    print("Document Type: ", doc_type)
    print("Instrument Number: ", instrument_number)
    print("Text: ", text)
I'm having a bit of trouble trying to save the links from a website into a list without repeating URLs with the same domain.
Example:
www.python.org/download and www.python.org/about
should only save the first one (www.python.org/download) and not repeat it later
This is what I've got so far:
from bs4 import BeautifulSoup
import requests
from urllib.parse import urlparse
url = "https://docs.python.org/3/library/urllib.request.html#module-urllib.request"
result = requests.get(url)
doc = BeautifulSoup(result.text, "html.parser")
atag = doc.find_all('a', href=True)
links = []
#below should be some kind of for loop
As a one-liner:
links = {nl for a in doc.find_all('a', href=True) if (nl := urlparse(a["href"]).netloc) != ""}
Explained:
links = set()                           # define empty set
for a in doc.find_all('a', href=True):  # loop over every <a> element
    nl = urlparse(a["href"]).netloc     # get netloc from url
    if nl:
        links.add(nl)                   # add to set if it exists
output:
{'www.w3.org', 'datatracker.ietf.org', 'www.python.org', 'requests.readthedocs.io', 'github.com', 'www.sphinx-doc.org'}
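If you want to keep the first full URL for each domain rather than just the netloc, the same loop works with a dict keyed on the netloc (a minimal sketch, reusing doc from above):
first_link_per_domain = {}                     # netloc -> first href seen
for a in doc.find_all('a', href=True):
    nl = urlparse(a["href"]).netloc            # domain part of the href
    if nl and nl not in first_link_per_domain:
        first_link_per_domain[nl] = a["href"]  # keep only the first URL per domain
links = list(first_link_per_domain.values())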
I am trying to scrape news from Reuters, but there is a "click to view more" button at the bottom of the website. I can't figure out how to load the hidden results using Beautiful Soup.
from bs4 import BeautifulSoup
import urllib.request
def scrape_reuters_news(ticker):
    url = "https://www.reuters.com/search/news?sortBy=relevance&dateRange=pastWeek&blob="+ticker
    scraped_data = urllib.request.urlopen(url)
    scraped_data = scraped_data.read()
    parsed_articles = BeautifulSoup(scraped_data, 'lxml')
    links = parsed_articles.find_all("h3")
    articles = []
    titles = []
    title_class = "Text__text___3eVx1j Text__dark-grey___AS2I_p Text__medium___1ocDap Text__heading_2___sUlNJP Heading__base___1dDlXY Heading__heading_2___3f_bIW ArticleHeader__heading___3ibi0Q"
    for link in links:
        paragraphs = ""
        url = "https://www.reuters.com/"+str(link)[41:63]
        scraped_data = urllib.request.urlopen(url)
        scraped_data = scraped_data.read()
        parsed_article = BeautifulSoup(scraped_data, 'lxml')
        article = parsed_article.find_all("p")
        title = parsed_article.select("h1", {"class": title_class})
        titles.append(title[0].text.strip())
        for paragraph in article:
            paragraphs += paragraph.text + " "
        articles.append(paragraphs)
    return titles, articles
# edit
ticker = "apple"
news = scrape_reuters_news(ticker)
When you click "load more", a callback request is issued that you can find in the network tab. If you grab the number of results from the search page, you can add this into the callback to get all results in one go. I then use a regex to extract the id (used to reconstruct each detail page URL) and the title (headline).
You would then visit each link to get the paragraph info.
Please note:
There is some de-duplication work to do: different ids can lead to the same content, so perhaps exclude duplicates based on title?
You may need to consider whether any pre-processing of ticker needs to happen, e.g. converting to lowercase or replacing spaces with "-". I don't know all your use cases.
from bs4 import BeautifulSoup as bs
import requests, re
ticker = 'apple'
with requests.Session() as s:
    r = s.get(f'https://www.reuters.com/search/news?sortBy=relevance&dateRange=pastWeek&blob={ticker}')
    soup = bs(r.content, 'lxml')
    num_results = soup.select_one('.search-result-count-num').text
    r = s.get(f'https://www.reuters.com/assets/searchArticleLoadMoreJson?blob={ticker}&bigOrSmall=big&articleWithBlog=true&sortBy=relevance&dateRange=pastWeek&numResultsToShow={num_results}&pn=&callback=addMoreNewsResults')
    p = re.compile(r'id: "(.*?)"')
    p2 = re.compile(r'headline: "(.*?)"')
    links = [f'https://www.reuters.com/article/id{i}' for i in p.findall(r.text)]
    headlines = [bs(i, 'lxml').get_text() for i in p2.findall(r.text)]
    print(len(links), len(headlines))
From the detail pages you can get the paragraphs with
paras = ' '.join([i.get_text() for i in soup.select('[data-testid*=paragraph-]')])
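For example, looping over the reconstructed links (a sketch; it reuses the paragraph selector from the line above and assumes the detail pages still serve that markup):
articles = []
for link in links:
    r = requests.get(link)
    detail = bs(r.content, 'lxml')
    # join all paragraph elements of the article into one string
    paras = ' '.join(i.get_text() for i in detail.select('[data-testid*=paragraph-]'))
    articles.append(paras)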
In the code below, I fill in a form and submit it on a website, then scrape the resulting data and write it to a CSV file (all of this works very well). But there is a link on that result page with the text 'Later'; how can I click this link? I am using mechanize. I have checked a similar question, but it doesn't quite answer mine.
# import needed libraries
from mechanize import Browser
from datetime import datetime
from bs4 import BeautifulSoup
import csv
br = Browser()
# Ignore robots.txt
br.set_handle_robots(False)
# Google demands a user-agent that isn't a robot
br.addheaders = [('User-agent', 'Chrome')]
# Retrieve the Google home page, saving the response
br.open('http://fahrplan.sbb.ch/bin/query.exe/en')
# Enter the text input (This section should be automated to read multiple text input as shown in the question)
br.select_form(nr=6)
br.form["REQ0JourneyStopsS0G"] = 'Eisenstadt' # Origin train station (From)
br.form["REQ0JourneyStopsZ0G"] ='sarajevo' # Destination train station (To)
br.form["REQ0JourneyTime"] = '5:30' # Search Time
br.form["date"] = '18.01.17' # Search Date
# Get the search results
br.submit()
# get the response from mechanize Browser
soup = BeautifulSoup(br.response().read(), 'html.parser', from_encoding="utf-8")
trs = soup.select('table.hfs_overview tr')
# scrape the contents of the table to csv (This is not complete as I cannot write the duration column to the csv)
with open('out.csv', 'w') as f:
    for tr in trs:
        locations = tr.select('td.location')
        if len(locations) > 0:
            location = locations[0].contents[0].strip()
            prefix = tr.select('td.prefix')[0].contents[0].strip()
            time = tr.select('td.time')[0].contents[0].strip()
            #print tr.select('td.duration').contents[0].strip()
            durations = tr.select('td.duration')
            #print durations
            if len(durations) == 0:
                duration = ''
                #print("oops! There aren't any durations.")
            else:
                duration = durations[0].contents[0].strip()
            f.write("{},{},{}, {}\n".format(location.encode('utf-8'), prefix, time, duration))
The HTML with the Later link looks like
<a accesskey="l" class="hafas-browse-link" href="http://fahrplan.sbb.ch/bin/query.exe/en?ld=std2.a&seqnr=1&ident=kv.047469247.1487285405&REQ0HafasScrollDir=1" id="hfs_linkLater" title="Search for later connections">Later</a>
You can find the url using:
In [22]: soup.find('a', text='Later')['href']
Out[22]: u'http://fahrplan.sbb.ch/bin/query.exe/en?ld=std2.a&seqnr=1&ident=kv.047469247.1487285405&REQ0HafasScrollDir=1'
To make the browser go to that link call br.open:
In [21]: br.open(soup.find('a', text='Later')['href'])
Out[21]: <response_seek_wrapper at 0x7f346a5da320 whose wrapped object = <closeable_response at 0x7f3469bee830 whose fp = <socket._fileobject object at 0x7f34697f26d0>>>
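If you want to keep walking through later connections, you can repeat that until the link disappears (a sketch built on the code above; each iteration re-parses the current response before following the next 'Later' link):
while True:
    soup = BeautifulSoup(br.response().read(), 'html.parser', from_encoding="utf-8")
    # ... scrape the rows of the current page here, as in the question ...
    later = soup.find('a', text='Later')
    if later is None:
        break
    br.open(later['href'])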
I'm trying to scrape a list of URLs from the European Parliament's Legislative Observatory. I don't type in any search keyword, in order to get all links to documents (currently 13172). I can easily scrape a list of the first 10 results displayed on the website using the code below. However, I want all the links, so that I don't have to somehow press the next-page button. Please let me know if you know of a way to achieve this.
import requests, bs4, re
# main url of the Legislative Observatory's search site
url_main = 'http://www.europarl.europa.eu/oeil/search/search.do?searchTab=y'
# function gets a list of links to the procedures
def links_to_procedures(url_main):
    # requesting html code from the main search site of the Legislative Observatory
    response = requests.get(url_main)
    soup = bs4.BeautifulSoup(response.text)  # loading text into Beautiful Soup
    links = [a.attrs.get('href') for a in soup.select('div.procedure_title a')]  # getting a list of links of the procedure title
    return links
print(links_to_procedures(url_main))
You can follow the pagination by specifying the page GET parameter.
First, get the results count, then calculate the number of pages to process by dividing the count by the number of results per page. Then, iterate over the pages one by one and collect the links:
import math
import re
from bs4 import BeautifulSoup
import requests
response = requests.get('http://www.europarl.europa.eu/oeil/search/search.do?searchTab=y')
soup = BeautifulSoup(response.content)
# get the results count
num_results = soup.find('span', class_=re.compile('resultNum')).text
num_results = int(re.search('(\d+)', num_results).group(1))
print "Results found: " + str(num_results)
results_per_page = 50
# round up so the last, partially filled page is not skipped
num_pages = int(math.ceil(num_results / float(results_per_page)))
base_url = "http://www.europarl.europa.eu/oeil/search/result.do?page={page}&rows=%s&sort=d&searchTab=y&sortTab=y&x=1411566719001" % results_per_page
links = []
for page in xrange(1, num_pages + 1):
    print "Current page: " + str(page)
    url = base_url.format(page=page)
    response = requests.get(url)
    soup = BeautifulSoup(response.content)
    links += [a.attrs.get('href') for a in soup.select('div.procedure_title a')]
print links
So I wanted to get all of the pictures on this page (of the NBA teams):
http://www.cbssports.com/nba/draft/mock-draft
However, my code gives a bit more than that. It gives me,
<img src="http://sports.cbsimg.net/images/nba/logos/30x30/ORL.png" alt="Orlando Magic" width="30" height="30" border="0" />
How can I shorten it so that it only gives me http://sports.cbsimg.net/images/nba/logos/30x30/ORL.png?
My code:
import urllib2
from BeautifulSoup import BeautifulSoup
# or if you're using BeautifulSoup4:
# from bs4 import BeautifulSoup
soup = BeautifulSoup(urllib2.urlopen('http://www.cbssports.com/nba/draft/mock-draft').read())
rows = soup.findAll("table", attrs = {'class': 'data borderTop'})[0].tbody.findAll("tr")[2:]
for row in rows:
    fields = row.findAll("td")
    if len(fields) >= 3:
        anchor = row.findAll("td")[1].find("a")
        if anchor:
            print anchor
I know this can be "traumatic", but for those automatically generated pages, where you just want to grab the damn images away and never come back, a quick-n-dirty regular expression that takes the desired pattern tends to be my choice (no Beautiful Soup dependency is a great advantage):
import urllib, re
source = urllib.urlopen('http://www.cbssports.com/nba/draft/mock-draft').read()
## every image name is an abbreviation composed of capital letters, so...
for link in re.findall('http://sports.cbsimg.net/images/nba/logos/30x30/[A-Z]*.png', source):
    print link
    ## the line above just prints the link;
    ## if you want to actually download, set the flag below to True
    actually_download = False
    if actually_download:
        filename = link.split('/')[-1]
        urllib.urlretrieve(link, filename)
Hope this helps!
To save all the images on http://www.cbssports.com/nba/draft/mock-draft,
import urllib2
import os
from BeautifulSoup import BeautifulSoup
URL = "http://www.cbssports.com/nba/draft/mock-draft"
default_dir = os.path.join(os.path.expanduser("~"),"Pictures")
opener = urllib2.build_opener()
urllib2.install_opener(opener)
soup = BeautifulSoup(urllib2.urlopen(URL).read())
imgs = soup.findAll("img",{"alt":True, "src":True})
for img in imgs:
    img_url = img["src"]
    filename = os.path.join(default_dir, img_url.split("/")[-1])
    img_data = opener.open(img_url)
    f = open(filename, "wb")
    f.write(img_data.read())
    f.close()
To save any particular image on http://www.cbssports.com/nba/draft/mock-draft,
use
soup.find("img",{"src":"image_name_from_source"})
You can use these functions to get the list of all image URLs from a URL.
import re
import requests
#
# get_url_images_in_text()
#
# @param html - the html to extract image urls from.
# @param protocol - the protocol of the website, prepended to urls that do not start with a protocol.
#
# @return list of image urls.
#
def get_url_images_in_text(html, protocol):
    urls = []
    all_urls = re.findall(r'((http\:|https\:)?\/\/[^"\' ]*?\.(png|jpg))', html, flags=re.IGNORECASE | re.MULTILINE | re.UNICODE)
    for url in all_urls:
        if not url[0].startswith("http"):
            urls.append(protocol + url[0])
        else:
            urls.append(url[0])
    return urls
#
# get_images_from_url()
#
# @param url - the url to extract image urls from.
#
# @return list of image urls.
#
def get_images_from_url(url):
    protocol = url.split('/')[0]
    resp = requests.get(url)
    return get_url_images_in_text(resp.text, protocol)
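Usage might look like this (a sketch; the URL is the mock-draft page from the question):
image_urls = get_images_from_url("http://www.cbssports.com/nba/draft/mock-draft")
for image_url in image_urls:
    print(image_url)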