I am trying to scrape the info from the election results in 18 NI constituencies here:
http://www.eoni.org.uk/Elections/Election-results-and-statistics/Election-results-and-statistics-2003-onwards/Elections-2019/UK-Parliamentary-Election-2019-Results
Each of the unique URLs starts like this:
http://www.eoni.org.uk/Elections/Election-results-and-statistics/Election-results-and-statistics-2003-onwards/Elections-2019/
The selector for the 18 URLs is as follows:
#container > div.two-column-content.clearfix > div > div.right-column.cms > div > ul > li
What I want to start with is a list of the 18 URLs. This list should be clean (i.e. just the actual addresses, no tags, etc.).
My Code so far:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from time import sleep
from random import randint
from selenium import webdriver
url = 'http://www.eoni.org.uk/Elections/Election-results-and-statistics/Election-results-and-statistics-2003-onwards/Elections-2019/UK-Parliamentary-Election-2019-Results'
response = requests.get(url)
response.status_code
text = requests.get(url).text
soup = BeautifulSoup(text, parser="html5lib")
link_list = []
for a in soup('a'):
    if a.has_attr('href'):
        link_list.append(a)
re_pattern = r"^/Elections/Election-results-and-statistics/Election-results-and-statistics-2003-onwards/Elections-2019/"
This is where I get lost, as I need to search for all 18 URLs that start with that pattern (I'm pretty sure the pattern is wrong. Please help!)
The rest of the code:
import re
good_urls = [url for url in link_list if re.match(re_pattern, url)]
here I get this error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-36-f3fbbd3199b1> in <module>
----> 1 good_urls = [url for url in link_list if re.match(re_pattern, url)]
<ipython-input-36-f3fbbd3199b1> in <listcomp>(.0)
----> 1 good_urls = [url for url in link_list if re.match(re_pattern, url)]
~/opt/anaconda3/lib/python3.7/re.py in match(pattern, string, flags)
173 """Try to apply the pattern at the start of the string, returning
174 a Match object, or None if no match was found."""
--> 175 return _compile(pattern, flags).match(string)
176
177 def fullmatch(pattern, string, flags=0):
TypeError: expected string or bytes-like object
What should I type differently to get those 18 URLs? Thank you!
This seems to do the job.
I've removed some unnecessary imports and things that aren't needed here; just re-add them if you need them elsewhere, of course.
The error message was due to trying to do a regex comparison on a soup object; it needs to be cast to a string first (the same problem as discussed in the link @Huzefa posted, so that was definitely relevant).
Fixing that still left the issue of isolating the correct strings. I've simplified the regex used for matching, then split each match on the double-quote character and selected the second element of the split (which is our URL).
import requests
from bs4 import BeautifulSoup
import re
url = 'http://www.eoni.org.uk/Elections/Election-results-and-statistics/Election-results-and-statistics-2003-onwards/Elections-2019/UK-Parliamentary-Election-2019-Results'
response = requests.get(url)
text = requests.get(url).text
soup = BeautifulSoup(text, "html.parser")
re_pattern = "<a href=\".*/Elections-2019/.*"
link_list = []
for a in soup('a'):
    if a.has_attr('href') and re.match(re_pattern, str(a)):
        link_list.append(str(a).split('"')[1])
Hope it fits your purpose, ask if anything is unclear.
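A slightly different sketch (not part of the answer above), assuming the constituency links are relative hrefs beginning with the /Elections-2019/ path from the question: read each tag's href directly and filter on that prefix, which avoids regex matching on the stringified tag.
base = "http://www.eoni.org.uk"
prefix = ("/Elections/Election-results-and-statistics/"
          "Election-results-and-statistics-2003-onwards/Elections-2019/")

good_urls = [
    base + a["href"]                      # build an absolute address
    for a in soup.find_all("a", href=True)
    if a["href"].startswith(prefix)       # keep only the Elections-2019 pages
]
Depending on the page, this may also pick up the results index page itself, so you might still need to drop that entry from the list.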
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
url = "https://www.property24.com/for-sale/woodland-hills-wildlife-estate/bloemfontein/free-state/10467/109825373"
data = requests.get(url)
soup = bs(data.content,"html.parser")
The code below is a test to get one item.
property_overview = soup.find(class_="p24_regularListing").find(class_="p24_propertyOverview").find(class_='p24_propertyOverviewRow').find(class_='col-xs-6 p24_propertyOverviewKey').text
property_overview
Output : 'Listing Number'
The code below is what we have to get all of the col-xs-6 p24_propertyOverviewKey elements:
p24_regularListing_items = soup.find_all(class_="p24_regularListing")
for p24_propertyOverview_item in p24_regularListing_items:
    p24_propertyOverview_items = p24_propertyOverview_item.find_all(class_="p24_propertyOverview")
    for p24_propertyOverviewRow_item in p24_propertyOverview_items:
        p24_propertyOverviewRow_items = p24_propertyOverviewRow_item.find_all(class_="p24_propertyOverviewRow")
        for p24_propertyOverviewKey_item in p24_propertyOverviewRow_items:
            p24_propertyOverviewKey_items = p24_propertyOverviewKey_item.find_all(class_="col-xs-6 p24_propertyOverviewKey")
p24_propertyOverviewKey_items
The code above only outputs one item, not all of them.
To put things more simply, you can use soup.select() (and via the comments, you can then use .get_text() to extract the text from each tag).
from bs4 import BeautifulSoup
import requests
resp = requests.get(
    "https://www.property24.com/for-sale/woodland-hills-wildlife-estate/bloemfontein/free-state/10467/109825373"
)
resp.raise_for_status()
soup = BeautifulSoup(resp.content, "html.parser")
texts = []
for tag in soup.select(
    # NB: this selector uses Python's implicit string concatenation
    # to split it onto several lines.
    ".p24_regularListing "
    ".p24_propertyOverview "
    ".p24_propertyOverviewRow "
    ".p24_propertyOverviewKey"
):
    texts.append(tag.get_text())
print(texts)
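If the extracted strings carry stray whitespace, the same idea can be written as a one-line comprehension with get_text(strip=True); just a minor variant of the loop above:
selector = (".p24_regularListing .p24_propertyOverview "
            ".p24_propertyOverviewRow .p24_propertyOverviewKey")
texts = [tag.get_text(strip=True) for tag in soup.select(selector)]  # strip surrounding whitespace
print(texts)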
from bs4 import BeautifulSoup
from urllib.request import urlopen as uReq
import requests
url = 'https://en.wikisource.org/wiki/Main_Page'
r = requests.get(url)
Soup = BeautifulSoup(r.text, "html5lib")
List = Soup.find("div",class_="enws-mainpage-widget-content", id="enws-mainpage-newtexts-content").find_all('a')
ebooks=[]
i=0
for ebook in List:
    x = ebook.get('title')
    for ch in x:
        if ch == ":":
            x = ""
    if x != "":
        ebooks.append(x)
        i = i + 1
print("Please select a book: ")
inputnumber = 0
while inputnumber < len(ebooks):
    print(inputnumber+1, " - ", ebooks[inputnumber])
    inputnumber = inputnumber + 1
input=int(input())
selectedbook = Soup.find("href", title=ebooks[input-1])
print(selectedbook)
I want to get the href of whichever book was selected by the user, but as output I get: None
Can someone please tell me where I am going wrong?
I changed the last two lines of your code, and added these
selectedbook = Soup.find("a", title=ebooks[input-1])
print(selectedbook['title'])
print("https://en.wikisource.org/"+selectedbook['href'])
This just works!
NB: The find() method searches for the first tag with the specified name and returns an object of type bs4.element.Tag.
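As a small aside (not from the original answer): instead of concatenating the site root and the relative href by hand, urljoin takes care of the slashes. A sketch, assuming selectedbook was found as above:
from urllib.parse import urljoin

# Build the absolute address from the selected book's relative href
full_url = urljoin("https://en.wikisource.org/", selectedbook['href'])
print(full_url)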
I would like to retrieve the URLs of a web page recursively and get the result in a list.
This is the code I'm using:
catalog_url = "http://nomads.ncep.noaa.gov:9090/dods/gfs_0p25/"
from bs4 import BeautifulSoup # conda install -c asmeurer beautiful-soup=4.3.2
import urllib2
from datetime import datetime
html_page = urllib2.urlopen(catalog_url)
soup = BeautifulSoup(html_page)
urls_day = []
for link in soup.findAll('a'):
    if datetime.today().strftime('%Y') in link.get('href'): # String contains today's year in name
        print link.get('href')
        urls_day.append(link.get('href'))

urls_final = []
for run in urls_day:
    html_page2 = urllib2.urlopen(run)
    soup2 = BeautifulSoup(html_page2)
    for links in soup2.findAll('a'):
        if datetime.today().strftime('%Y') in soup2.get('a'):
            print links.get('href')
            urls_final.append(links.get('href'))
In the first loop I get the URLs in catalog_url. urls_day is a list object with the URLs that contain the string of the current year in their name.
The second loop fails with the following output:
GrADS Data Server
Traceback (most recent call last):
File "<stdin>", line 6, in <module>
TypeError: argument of type 'NoneType' is not iterable
urls_final should be the list object containing the URLs I'm interested in.
Any idea how to solve it? I've checked similar posts on Beautiful Soup with recursion, but I always get the same 'NoneType' error.
You should check if the returned value is a NoneType before calling the recursive function. I wrote an example which you can improve upon.
from bs4 import BeautifulSoup
from datetime import datetime
import urllib2
CATALOG_URL = "http://nomads.ncep.noaa.gov:9090/dods/gfs_0p25/"
today = datetime.today().strftime('%Y')
cache = {}
def cached(func):
    def wraps(url):
        if url not in cache:
            cache[url] = True
            return func(url)
    return wraps

@cached
def links_from_url(url):
    html_page = urllib2.urlopen(url)
    soup = BeautifulSoup(html_page, "lxml")
    s = set([link.get('href') for link in soup.findAll('a') if today in link.get('href')])
    return s if len(s) else url

def crawl(links):
    if not links:  # Checking for NoneType
        return
    if type(links) is str:
        return links
    if len(links) > 1:
        return [crawl(links_from_url(link)) for link in links]

if __name__ == '__main__':
    crawl(links_from_url(CATALOG_URL))
    print cache.keys()
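For reference, the TypeError in the question's second loop comes from soup2.get('a'): calling .get() on a soup looks up an attribute named 'a', which does not exist, so it returns None and the in test fails. A minimal in-place fix that keeps the original structure might look like this (a sketch, not part of the answer above):
for run in urls_day:
    html_page2 = urllib2.urlopen(run)
    soup2 = BeautifulSoup(html_page2)
    for links in soup2.findAll('a'):
        href = links.get('href')
        # test the link's href (which may be None), not soup2.get('a')
        if href and datetime.today().strftime('%Y') in href:
            urls_final.append(href)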
I'm working through a scraping task in Python using BeautifulSoup and am getting some strange errors. The error mentions strip, which I'm not using, so I'm guessing it might be related to something BeautifulSoup does internally?
In the task I'm trying to go to the original URL, find the 18th link, click that link 7 times, and then return the name result for the 18th link on the 7th page. I'm trying to use a function to get the href from the 18th link, then adjust the global variable to recurse through with a different URL each time. Any advice on what I'm missing would be really helpful. I'll list the code and errors:
from bs4 import BeautifulSoup
import urllib
import re
nameList = []
urlToUse = "http://python-data.dr-chuck.net/known_by_Basile.html"
def linkOpen():
    global urlToUse
    html = urllib.urlopen(urlToUse)
    soup = BeautifulSoup(html, "lxml")
    tags = soup("li")
    count = 0
    for tag in tags:
        if count == 17:
            tagUrl = re.findall('href="([^ ]+)"', str(tag))
            nameList.append(tagUrl)
            urlToUse = tagUrl
            count = count + 1
        else:
            count = count + 1
            continue

bigCount = 0
while bigCount < 9:
    linkOpen()
    bigCount = bigCount + 1
print nameList[8]
Errors:
Traceback (most recent call last):
  File "assignmentLinkScrape.py", line 26, in <module>
    linkOpen()
  File "assignmentLinkScrape.py", line 10, in linkOpen
    html = urllib.urlopen(urlToUse)
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.py", line 87, in urlopen
    return opener.open(url)
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.py", line 185, in open
    fullurl = unwrap(toBytes(fullurl))
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.py", line 1075, in unwrap
    url = url.strip()
AttributeError: 'list' object has no attribute 'strip'
re.findall() returns a list of matches. urlToUse is a list and you are trying to pass it to urlopen() which expects a URL string instead.
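Put differently, the smallest change to the original function is to take the first element of that list before assigning it; a sketch of the minimal fix, keeping the regex approach:
tagUrl = re.findall('href="([^ ]+)"', str(tag))
if tagUrl:                 # findall returns a list of matches
    urlToUse = tagUrl[0]   # urlopen() needs a URL string, not a list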
Alexce has explained your error, but you don't need a regex at all; you just want to get the 18th li tag and extract the href from the anchor tag inside it. You can use find with find_all:
from bs4 import BeautifulSoup
import requests
soup = BeautifulSoup(requests.get("http://python-data.dr-chuck.net/known_by_Basile.html").content,"lxml")
url = soup.find("ul").find_all("li", limit=18)[-1].a["href"]
Or use a css selector:
url = soup.select_one("ul li:nth-of-type(18) a")["href"]
So to get the name after visiting the URL seven times, put the logic in a function: visit the initial URL, then visit and extract the anchor seven times, and on the last visit just extract the text from the anchor:
from bs4 import BeautifulSoup
import requests
soup = BeautifulSoup(requests.get("http://python-data.dr-chuck.net/known_by_Basile.html").content,"lxml")
def get_nth(n, soup):
    return soup.select_one("ul li:nth-of-type({}) a".format(n))

start = get_nth(18, soup)
for _ in range(7):
    soup = BeautifulSoup(requests.get(start["href"]).content, "html.parser")
    start = get_nth(18, soup)
print(start.text)
From an online Python course:
You will be given a website with 100 names. All names are in the form of a link. Each link leads to another 100 links. You must use Python to select the 18th link 7 times, and print out the results.
my code so far:
z = 0
atags = []
listurl = []
#import modules
import urllib
from bs4 import BeautifulSoup
import re
newurl = "https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Desmond.html"
while z < 7:
    url = newurl
    z = z + 1
    html = urllib.urlopen(url).read()
    soup = BeautifulSoup(html)
    soup.find_all("url")
    a = soup.find_all('a')
    for x in a:
        atags.append(str(x))
    url_end_full = atags[19]
    url_end = re.findall(r'"(.*?)"', url_end_full)
    url_end = str(url_end[0])
    newurl = 'https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/' + url_end
    str(newurl)
    listurl.append(newurl)
    url = newurl
print url
It does not work. It keeps giving me the same link...
this is the output:
https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Lauchlin.html
[Finished in 2.4s]
The answer was wrong when I entered it into the answer box.
There are a couple of problems.
atags[19] is not the 18th item, it is the 20th (lst[0] is the first item in a list).
soup.find_all("url") does nothing; get rid of it.
you do not need re.
The links returned are relative; you are doing a hard-join to the base path to make them absolute. In this case it works, but that is a matter of luck; do it right with urljoin.
While str(link) does get you the URL, the "proper" method is indexing into the attributes, i.e. link['href'].
With some judicious cleanup,
from bs4 import BeautifulSoup
import sys
# version compatibility shim
if sys.hexversion < 0x3000000:
    # Python 2.x
    from urlparse import urljoin
    from urllib import urlopen
else:
    # Python 3.x
    from urllib.parse import urljoin
    from urllib.request import urlopen

START_URL = "https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Desmond.html"
STEPS = 7
ITEM = 18

def get_soup(url):
    with urlopen(url) as page:
        return BeautifulSoup(page.read(), 'lxml')

def main():
    url = START_URL
    for step in range(STEPS):
        print("\nStep {}: looking at '{}'".format(step, url))
        # get the right item (Python arrays start indexing at 0)
        links = get_soup(url).find_all("a")
        rel_url = links[ITEM - 1]["href"]
        # convert from relative to absolute url
        url = urljoin(url, rel_url)
        print(" go to '{}'".format(url))

if __name__ == "__main__":
    main()
which, if I did it right, ends with known_by_Gideon.html