Problem with .get('href') link using scraper? - python

So I am trying to follow a video tutorial that is a bit outdated. In the video, href = links[idx].get('href') grabs the link, but when I use it here it doesn't work - it just returns None. If I use .getText() instead, it grabs the title fine.
The element containing the href and title is <span class="titleline"><a href="https://mullvad.net/nl/blog/2023/2/2/stop-the-proposal-on-mass-surveillance-of-the-eu/">Stop the proposal on mass surveillance of the EU</a></span>
Here's my code:
import requests
from bs4 import BeautifulSoup

res = requests.get('https://news.ycombinator.com/news')
soup = BeautifulSoup(res.text, 'html.parser')
links = soup.select('.titleline')
votes = soup.select('.score')

def create_custom_hn(links, votes):
    hn = []
    for idx, item in enumerate(links):
        title = links[idx].getText()
        href = links[idx].get('href')
        print(href)
        #hn.append({'title': title, 'link': href})
    return hn

print(create_custom_hn(links, votes))
I tried to grab the link using .get('href')

Try to select your elements more specifically, and avoid using separate lists; there is no need for them, and you would have to ensure that they all have the same length.
You could get all the information in one go by selecting the <tr> with class athing and its next sibling.
Example
import requests
from bs4 import BeautifulSoup

soup = BeautifulSoup(requests.get('https://news.ycombinator.com/news').text, 'html.parser')

data = []
for i in soup.select('.athing'):
    data.append({
        'title': i.select_one('span a').text,
        'link': i.select_one('span a').get('href'),
        'score': list(i.next_sibling.find('span').stripped_strings)[0]
    })
data
Output
[{'title': 'Stop the proposal on mass surveillance of the EU',
'link': 'https://mullvad.net/nl/blog/2023/2/2/stop-the-proposal-on-mass-surveillance-of-the-eu/',
'score': '287 points'},
{'title': 'Bay 12 Games has made $7M from the Steam release of Dwarf Fortress',
'link': 'http://www.bay12forums.com/smf/index.php?topic=181354.0',
'score': '416 points'},
{'title': "Google's OSS-Fuzz expands fuzz-reward program to $30000",
'link': 'https://security.googleblog.com/2023/02/taking-next-step-oss-fuzz-in-2023.html',
'score': '31 points'},
{'title': "Connecticut Parents Arrested for Letting Kids Walk to Dunkin' Donuts",
'link': 'https://reason.com/2023/01/30/dunkin-donuts-parents-arrested-kids-cops-freedom/',
'score': '225 points'},
{'title': 'Ronin 2.0 – open-source Ruby toolkit for security research and development',
'link': 'https://ronin-rb.dev/blog/2023/02/01/ronin-2-0-0-finally-released.html',
'score': '62 points'},...]
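As an aside, the reason .get('href') returned None in the original code is that .titleline matches the surrounding <span>, not the <a> inside it. Targeting the anchor directly would also make the original approach work; a minimal sketch, assuming the markup shown above:

import requests
from bs4 import BeautifulSoup

res = requests.get('https://news.ycombinator.com/news')
soup = BeautifulSoup(res.text, 'html.parser')

# '.titleline > a' selects the anchor inside the span, so .get('href')
# now returns the URL instead of None
for link in soup.select('.titleline > a'):
    print(link.get('href'), '-', link.getText())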

Related

How to scrape all match / ticket info while iterating a list?

Thanks in advance guys - I'm trying to compile ticket sale information into one easy-to-read list, or possibly a filtered table, but one step at a time.
I successfully managed to write a short script to list the pages for each event:
import requests
from bs4 import BeautifulSoup

url = "https://www.liverpoolfc.com/tickets/tickets-availability"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

pages = []
for link in soup.find_all("a", class_="ticket-card fixture"):
    href = link.get("href")
    if href:
        pages.append(href)

print("Pages:")
for page in set(pages):
    print("- " + page)
Which returns
Pages:
- /tickets/tickets-availability/wolverhampton-wanderers-v-liverpool-fc-4-feb-2023-0300pm-245
- /tickets/tickets-availability/liverpool-fc-v-arsenal-8-apr-2023-0300pm-236
- /tickets/tickets-availability/liverpool-fc-v-manchester-united-4-mar-2023-0300pm-235
- /tickets/tickets-availability/liverpool-fc-v-real-madrid-21-feb-2023-0800pm-238
- /tickets/tickets-availability/liverpool-fc-v-tottenham-hotspur-29-apr-2023-0300pm-232
- /tickets/tickets-availability/liverpool-fc-v-nottingham-forest-22-apr-2023-0300pm-234
- /tickets/tickets-availability/liverpool-fc-v-fulham-18-mar-2023-0300pm-237
- /tickets/tickets-availability/newcastle-united-v-liverpool-fc-18-feb-2023-0530pm-246
- /tickets/tickets-availability/liverpool-fc-v-brentford-6-may-2023-0300pm-231
- /tickets/tickets-availability/liverpool-fc-v-aston-villa-20-may-2023-0300pm-230
- /tickets/tickets-availability/liverpool-fc-v-everton-13-feb-2023-0800pm-233
- /tickets/tickets-availability/crystal-palace-v-liverpool-fc-25-feb-2023-0745pm-247
So far so good.
But with the following code I'm only getting the first result, while I'm hoping to get about four sets, and find_all just doesn't seem to work (this is just for a single page at the moment):
import requests
from bs4 import BeautifulSoup

url = "https://www.liverpoolfc.com/tickets/tickets-availability/liverpool-fc-v-everton-13-feb-2023-0800pm-233"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# Find all the elements with the desired class
ticket_sales = soup.find_all(class_="accorMenu")

# Create a list to store the extracted information
sales_list = []

# Check if any ticket sales were found
if ticket_sales:
    # Iterate over each ticket sale
    for accorMenuList in ticket_sales:
        # Extract the desired information from the ticket sale
        saletype = soup.find("span", class_="saletype").text.strip()
        salename = soup.find("span", class_="salename").text.strip()
        prereqs = soup.find("span", class_="prereqs").text.strip()
        status = soup.find("span", class_="status").text.strip()
        whenavailable = soup.find("span", class_="whenavailable").text.strip()

        # Store the extracted information in a dictionary
        sale_info = {
            "saletype": saletype,
            "salename": salename,
            "prereqs": prereqs,
            "status": status,
            "whenavailable": whenavailable
        }

        # Add the dictionary to the list of sales
        sales_list.append(sale_info)

    # Print the list of sales
    for sale in sales_list:
        print("Saletype:", sale["saletype"])
        print("Salename:", sale["salename"])
        print("Prereqs:", sale["prereqs"])
        print("Status:", sale["status"])
        print("Whenavailable:", sale["whenavailable"])
        print("---")
else:
    # If no ticket sales were found, print a message
    print("No ticket sales found.")
returns:
Saletype: match ticket -
Salename: Hospitality
Prereqs:
Status: available
Whenavailable: Mon 6 Feb 2023, 11:00am
---
Your approach is already the right one, but you are subject to the following misconceptions:
ticket_sales = soup.find_all(class_="accorMenu") does not reference the individual list items but the list container itself, so the ResultSet contains only one element to iterate over.
Instead, use soup.select('.accorMenu li') or soup.select('.accorMenu h3') to select the individual containers.
CSS selectors are used here because they make chaining a bit easier than several find()/find_all() calls.
When iterating, do not reference the global soup object (saletype = soup.find("span", class_="saletype").text.strip()) but the element of the current iteration. Otherwise you will always get the information of the first matching element in soup.
Furthermore, you should always check whether an element was found at all before calling a method on it; this can be implemented with a simple if/else statement.
Example
import requests
from bs4 import BeautifulSoup

url = 'https://www.liverpoolfc.com/tickets/tickets-availability/liverpool-fc-v-everton-13-feb-2023-0800pm-233'
soup = BeautifulSoup(requests.get(url).text, 'html.parser')

sales_list = []
for e in soup.select('.accorMenu h3'):
    # Store the extracted information in a dictionary
    sales_list.append({
        "saletype": e.find("span", class_="saletype").text.strip(),
        "salename": e.find("span", class_="salename").text.strip(),
        "prereqs": e.find("span", class_="prereqs").text.strip(),
        "status": e.find("span", class_="status").text.strip(),
        "whenavailable": e.find("span", class_="whenavailable").text.strip() if e.find("span", class_="whenavailable") else None
    })
sales_list
Output
[{'saletype': 'match ticket -',
'salename': 'Hospitality',
'prereqs': '',
'status': 'available',
'whenavailable': None},
{'saletype': 'match ticket -',
'salename': 'Local Members Sale',
'prereqs': 'Members with an ‘L’ Postcode',
'status': 'sold out',
'whenavailable': None},
{'saletype': 'match ticket -',
'salename': 'Local General Sale',
'prereqs': 'Supporters with an ‘L’ Postcode',
'status': 'sold out',
'whenavailable': None},
{'saletype': 'match ticket -',
'salename': 'Additional Members Sale',
'prereqs': 'Members who have recorded 4+ Premier League home games from either season 2018/19 or 2019/20',
'status': 'on sale soon',
'whenavailable': 'Mon 6 Feb 2023, 11:00am'}]
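If you end up guarding many fields like that, a small helper keeps the dictionary readable. A minimal sketch (the helper name text_or_none is my own, reusing the soup from above):

def text_or_none(parent, selector):
    # Return the stripped text of the first match, or None if nothing matches
    node = parent.select_one(selector)
    return node.get_text(strip=True) if node else None

sales_list = [{
    'saletype': text_or_none(e, '.saletype'),
    'salename': text_or_none(e, '.salename'),
    'prereqs': text_or_none(e, '.prereqs'),
    'status': text_or_none(e, '.status'),
    'whenavailable': text_or_none(e, '.whenavailable'),
} for e in soup.select('.accorMenu h3')]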

How do you use soup.find and soup.find_all?

Here is my code and the output
import requests
from bs4 import BeautifulSoup
res = requests.get("https://www.jobberman.com/jobs")
soup = BeautifulSoup(res.text, "html.parser")
job = soup.find("div", class_ = "relative inline-flex flex-col w-full text-sm font-normal pt-2")
company_name = job.find('a[href*="jobs"]')
print(company_name)
The output is:
None
But when I use the select method, I get the desired result, but I can't use .text on it:
import requests
from bs4 import BeautifulSoup
res = requests.get("https://www.jobberman.com/jobs")
soup = BeautifulSoup(res.text, "html.parser")
job = soup.find("div", class_ = "relative inline-flex flex-col w-full text-sm font-normal pt-2")
company_name = job.select('a[href*="jobs"]').text
print(company_name)
Output
AttributeError: ResultSet object has no attribute 'text'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?
Change your selection strategy - the main issue here is that not all company names are linked:
job.find('div',{'class':'search-result__job-meta'}).text.strip()
or
job.select_one('.search-result__job-meta').text.strip()
Example
Also, store your information in a structured way for post-processing:
import requests
from bs4 import BeautifulSoup

res = requests.get("https://www.jobberman.com/jobs")
soup = BeautifulSoup(res.text, "html.parser")

data = []
for job in soup.select('div:has(>.search-result__body)'):
    data.append({
        'job': job.h3.text,
        'company': job.select_one('.search-result__job-meta').text.strip()
    })
data
Output
[{'job': 'Restaurant Manager', 'company': 'Balkaan Employments service'},
{'job': 'Executive Assistant', 'company': 'Nolla Fresh & Frozen ltd'},
{'job': 'Portfolio Manager/Instructor 1', 'company': 'Fun Science World'},
{'job': 'Microbiologist', 'company': "NEIMETH INT'L PHARMACEUTICALS PLC"},
{'job': 'Data Entry Officer', 'company': 'Nkoyo Pharmaceuticals Ltd.'},
{'job': 'Chemical Analyst', 'company': "NEIMETH INT'L PHARMACEUTICALS PLC"},
{'job': 'Senior Front-End Engineer', 'company': 'Salvo Agency'},...]
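As for the AttributeError in your second snippet: select() always returns a ResultSet (a list of tags), which has no .text attribute; select_one() returns a single Tag (or None). A minimal sketch against your own selector, assuming a matching anchor is present:

company_link = job.select_one('a[href*="jobs"]')
if company_link is not None:
    print(company_link.text)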
The problems with your search strategy have been covered by the comments and answers posted earlier. I am offering a solution which uses the re (regex) library along with a find_all() call:
import requests
from bs4 import BeautifulSoup
import re

res = requests.get("https://www.jobberman.com/jobs")
soup = BeautifulSoup(res.text, "html.parser")

# Match anchors whose href contains "/jobs?" (raw string avoids an escape warning)
company_name = soup.find_all("a", href=re.compile(r"/jobs\?"), rel="nofollow")
for name in company_name:
    print(name.text)
Output:
GRATIAS DEI NIGERIA LIMITED
Balkaan Employments service
Fun Science World
NEIMETH INT'L PHARMACEUTICALS PLC
Nkoyo Pharmaceuticals Ltd.
...

Python BeautifulSoup Web Scraping works for the first but not second or any following times

I'm a Python newbie and am trying to script a web scraper to get my hands on some price data.
The website i am trying to scrape is for example:
https://www.medizinfuchs.de/?params%5Bsearch%5D=10192710&params%5Bsearch_cat%5D=1
I am using following code:
from bs4 import BeautifulSoup
import requests

URL = "https://www.medizinfuchs.de/?params%5Bsearch%5D=11484834&params%5Bsearch_cat%5D=1"
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")

for price in soup.select('li.apotheke div.price'):
    print(float(price.text.strip(' \t\n€').replace(',', '.')))
for name in soup.select('li.apotheke a.name'):
    print(str(name.text.strip(' \t\n€')))
It works like a charm the first time I run it, but after that I don't get any output...
The output I expect:
5.39
5.4
5.4
5.65
5.8
5.89
5.89
5.94
ApothekePrime
Apoversand24.de
bon-vita.de
1-apo.de
eurapon.de
docmorris.de
sternapo
ahorn24.de
Can you help me get it to work consistently?
Thanks
What happens?
Take a look into your soup - it tells the truth. There is no <li> with class apotheke in the soup, so you won't get any result.
How to fix?
Select the right tags, or skip the tag names and focus on the classes alone (not the best idea, because classes often change, but in this case it is the best you can do):
for e in soup.select('.apotheke'):
    print(e.select_one('.price').get_text(strip=True).split(' ')[0])

for e in soup.select('.apotheke'):
    print(e.select_one('.name').get_text(strip=True))
Example (more structured)
data = []
for e in soup.select('.apotheke'):
    data.append({
        'name': e.select_one('.name').get_text(strip=True),
        'price': e.select_one('.price').get_text(strip=True).split(' ')[0]
    })
data
Output
[{'name': 'ApothekePrime', 'price': '5,39'},
{'name': 'Apoversand24.de', 'price': '5,40'},
{'name': 'bon-vita.de', 'price': '5,40'},
{'name': '1-apo.de', 'price': '5,65'},
{'name': 'eurapon.de', 'price': '5,80'},
{'name': 'docmorris.de', 'price': '5,89'},
{'name': 'sternapo', 'price': '5,89'},
{'name': 'ahorn24.de', 'price': '5,94'}]
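If you want floats like in your original snippet, you can convert the German decimal commas afterwards; a small follow-up sketch on the data list above:

# Convert 'price' from German notation ('5,39') to float, as the
# original script did with .replace(',', '.')
for row in data:
    row['price'] = float(row['price'].replace(',', '.'))

print(data[0])  # {'name': 'ApothekePrime', 'price': 5.39}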

python beautifulsoup web scraping issue

import requests
from bs4 import BeautifulSoup

page = requests.get("http://www.freejobalert.com/upsc-recruitment/16960/#Engg-Services2019")
c = page.content
soup = BeautifulSoup(c, "html.parser")

data = soup.find_all("tr")
dict = {}
for r in data:
    td = r.find_all("td", {"style": "text-align: center;"})
    for d in td:
        link = d.find_all("a")
        for li in link:
            span = li.find_all("span", {"style": "color: #008000;"})
            for s in span:
                strong = s.find_all("strong")
                for st in strong:
                    dict['title'] = st.text
        for l in link:
            dict["link"] = l['href']
        print(dict)
It is giving
{'title': 'Syllabus', 'link': 'http://www.upsc.gov.in/'}
{'title': 'Syllabus', 'link': 'http://www.upsc.gov.in/'}
{'title': 'Syllabus', 'link': 'http://www.upsc.gov.in/'}
I am expecting:
{'title': 'Apply Online', 'link': 'https://upsconline.nic.in/mainmenu2.php'}
{'title': 'Notification', 'link': 'http://www.freejobalert.com/wp-content/uploads/2018/09/Notification-UPSC-Engg-Services-Prelims-Exam-2019.pdf'}
{'title': 'Official Website ', 'link': 'http://www.upsc.gov.in/'}
Here I want all the "Important Links", meaning "Apply Online", "Notification" and "Official Website", and their links for each table,
but it is giving me "Syllabus" as the title instead, with repeating links.
Please have a look into this.
This may help you, check the code below.
import requests
from bs4 import BeautifulSoup

page = requests.get('http://www.freejobalert.com/'
                    'upsc-recruitment/16960/#Engg-Services2019')
c = page.content
soup = BeautifulSoup(c, "html.parser")

row = soup.find_all('tr')
dict = {}
for i in row:
    for title in i.find_all('span', attrs={'style': 'color: #008000;'}):
        dict['Title'] = title.text
    for link in i.find_all('a', href=True):
        dict['Link'] = link['href']
    print(dict)
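Note that this loop overwrites dict['Title'] and dict['Link'] on each inner iteration, so every row only keeps its last pair. A hedged variant that pairs each anchor with its own label, assuming the markup from the question (a green span with style color: #008000 inside each link):

import requests
from bs4 import BeautifulSoup

page = requests.get('http://www.freejobalert.com/'
                    'upsc-recruitment/16960/#Engg-Services2019')
soup = BeautifulSoup(page.content, 'html.parser')

for row in soup.find_all('tr'):
    for a in row.find_all('a', href=True):
        # The green span holds the link label ('Apply Online', 'Notification', ...)
        span = a.find('span', attrs={'style': 'color: #008000;'})
        if span:
            print({'Title': span.get_text(strip=True), 'Link': a['href']})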

Python, Issue with Regex. Trying to scrape for some comicbook titles

I'm trying to scrape for comic book titles and their respective numbers, from this site.
But I'm having issues with regex, which I've never used before.
I don't want to bore you with my full code; suffice it to say I'm using Beautiful Soup, and what I need from regex is simply to pull out the title name and the episode number of each comic title from the list I'm looping through.
As you can tell from the webpage, this should be simplicity itself: the publisher name comes in all caps, always followed by the title, always followed by a #-symbol, always followed by the episode number.
Here is my approach:
import re

text = "876876 PUBLISHER title #345 jklhljhljh"

texpat = re.compile(r"PUBLISHER(.*?)#")
thename = texpat.search(text)
name = thename.group()

numpat = re.compile(r"#(\d+)")
num = numpat.search(text)

print(name)
print(num.group())
The output is:
PUBLISHER title #
#345
But it should be:
title
345
I can use the string replace method to remove the stuff I don't want, but then I get stuck with this output:
title
and name.strip() or name.lstrip() does NOT remove the extra three spaces.
It's late, I've never used regex before, I'm sure I'm doing something stupid.
I would use BeautifulSoup here to help with the HTML parsing:
import urllib2
from bs4 import BeautifulSoup

url = "http://www.comiclistdatabase.com/doku.php?id=comiclist_for_09_10_2014"
soup = BeautifulSoup(urllib2.urlopen(url))

for row in soup.select('div.table tr')[1:]:
    publisher = row.find('td', class_='col1').text
    title = row.find('td', class_='col2').text
    print {'publisher': publisher, 'title': title}
Prints:
{'publisher': u'AMIGO COMICS', 'title': u'Ghost Wolf #4 (Of 4)$3.99 '}
{'publisher': u'AMIGO COMICS', 'title': u'Rogues Volume 2 Cold Ship #4 (Of 5)'}
{'publisher': u'ARCHIE COMIC PUBLICATIONS', 'title': u'Archie Giant Comics Digest TP'}
{'publisher': u'ARCHIE COMIC PUBLICATIONS', 'title': u'Betty And Veronica #272 (Dan Parent Regular Cover)'}
...
Then, you can grab the number from the title if you want to extract it too. I'm using the #(\d+) regular expression, which matches a hash sign followed by one or more digits; the parentheses capture the number:
import re
import urllib2
from bs4 import BeautifulSoup

url = "http://www.comiclistdatabase.com/doku.php?id=comiclist_for_09_10_2014"
soup = BeautifulSoup(urllib2.urlopen(url))

NUMBER_RE = re.compile(r'#(\d+)')
for row in soup.select('div.table tr')[1:]:
    publisher = row.find('td', class_='col1').text
    title = row.find('td', class_='col2').text
    match = NUMBER_RE.search(title)
    number = match.group(1) if match else 'n/a'
    print {'publisher': publisher, 'title': title, 'number': number}
Prints:
{'publisher': u'AMIGO COMICS', 'number': u'4', 'title': u'Ghost Wolf #4 (Of 4)$3.99 '}
{'publisher': u'AMIGO COMICS', 'number': u'4', 'title': u'Rogues Volume 2 Cold Ship #4 (Of 5)'}
{'publisher': u'ARCHIE COMIC PUBLICATIONS', 'number': 'n/a', 'title': u'Archie Giant Comics Digest TP'}
...
import re
text = "876876 PUBLISHER title #345 jklhljhljh"
texpat = re.compile(r"PUBLISHER\s*(\S.*?)#")
thename = texpat.search(text)
name = thename.groups()[0]
numpat = re.compile(r"#(\d+)")
num = numpat.search(text)
print(name)
print(num.groups()[0])
The output is:
title
345
Match this to capture the title (in group one) and the number (in group two) with one expression:
PUBLISHER\s*(.+?)\s*#(\d+)
Then you need to use pattern.search(text).group(i) to get a capture group instead of the entire match:
import re
text = "876876 PUBLISHER title #345 jklhljhljh"
pattern = re.compile(r"PUBLISHER\s*(.+?)\s*#(\d+)")
results = pattern.search(text)
print(results.group(1))
print(results.group(2))
Output:
title
345
