python beautifulsoup web scraping issue - python

# Question snippet as posted; the original indentation is not shown here,
# so the exact nesting of the loops below cannot be read off this text.
# Download the page and parse it with the built-in HTML parser.
page = requests.get("http://www.freejobalert.com/upsc-recruitment/16960/#Engg-Services2019")
c = page.content
soup=BeautifulSoup(c,"html.parser")
# Drill down: table rows -> centre-aligned cells -> links -> green spans -> <strong>.
data=soup.find_all("tr")
for r in data:
td = r.find_all("td",{"style":"text-align: center;"})
for d in td:
link =d.find_all("a")
for li in link:
span = li.find_all("span",{"style":"color: #008000;"})
for s in span:
strong = s.find_all("strong")
for st in strong:
# NOTE(review): `dict` is not created in this snippet (and the name shadows
# the builtin); presumably one dict made earlier is reused, so each
# assignment overwrites the previous 'title' — confirm.
dict['title'] = st.text
# Every link in `link` writes the same 'link' key, so only the last href survives.
for l in link:
dict["link"] = l['href']
print(dict)
It is giving
{'title': 'Syllabus', 'link': 'http://www.upsc.gov.in/'}
{'title': 'Syllabus', 'link': 'http://www.upsc.gov.in/'}
{'title': 'Syllabus', 'link': 'http://www.upsc.gov.in/'}
I am expecting:
{'title': 'Apply Online', 'link': 'https://upsconline.nic.in/mainmenu2.php'}
{'title': 'Notification', 'link': 'http://www.freejobalert.com/wp-content/uploads/2018/09/Notification-UPSC-Engg-Services-Prelims-Exam-2019.pdf'}
{'title': 'Official Website ', 'link': 'http://www.upsc.gov.in/'}
Here I want all the "Important Links" — that is, "Apply Online", "Notification" and "Official Website" —
together with the link for each one, for every table.
But it gives me "Syllabus" as the title every time, with repeated links.
Please have a look at this.

This may help you, check the code below.
import requests
from bs4 import BeautifulSoup
# Print one {'Title': ..., 'Link': ...} record for every link in a table row
# whose label is rendered in a green <span> (the "important links" styling
# used by the selectors in the question).
page = requests.get('http://www.freejobalert.com/'
                    'upsc-recruitment/16960/#Engg-Services2019')
soup = BeautifulSoup(page.content, "html.parser")

for row in soup.find_all('tr'):
    for anchor in row.find_all('a', href=True):
        # Look for the label inside this very anchor so that a title is
        # always paired with its own href.
        label = anchor.find('span', attrs={'style': 'color: #008000;'})
        if label is None:
            continue  # a plain link without the green label
        # A fresh dict per link: reusing a single dict (and naming it `dict`,
        # which shadows the builtin) let values leak from one row to the next.
        record = {'Title': label.get_text(strip=True), 'Link': anchor['href']}
        print(record)

Related

Problem with .Get href link using scraper?

So I am trying to follow a video tutorial that is just a bit outdated. In the video, using href = links[idx].get('href') grabs the link, however if I use it here, it won't work. It just says none. If I just type .getText() it will grab the title.
The element for the entire href and title is Stop the proposal on mass surveillance of the EU
Here's my code:
`import requests
from bs4 import BeautifulSoup
res = requests.get('https://news.ycombinator.com/news')
soup = BeautifulSoup(res.text, 'html.parser')
links = soup.select('.titleline')
votes = soup.select('.score')
def create_custom_hn(links, votes):
# Intended to collect one {'title', 'link'} dict per story; the append is
# commented out below, so `hn` is returned empty. `votes` is not used here.
hn = []
for idx, item in enumerate(links):
title = links[idx].getText()
# .get('href') reads the attribute of the selected element itself, not of
# any tag nested inside it.
href = links[idx].get('href')
print(href)
#hn.append({'title': title, 'link': href})
return hn
print(create_custom_hn(links, votes))`
I tried to grab the link using .get('href')
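Try to select your elements more specifically and avoid using separate lists; there is no need for them, and you would have to ensure that they all have the same length. Note that .titleline matches the <span> that wraps the link, and the href attribute lives on the <a> inside that span, which is why .get('href') returns None.
You could get all the information in one go by selecting each <tr> with class athing together with its next sibling.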
Example
import requests
from bs4 import BeautifulSoup
# Build one {'title', 'link', 'score'} record per Hacker News story row.
response = requests.get('https://news.ycombinator.com/news')
# Name the parser explicitly: without it bs4 picks whichever parser happens
# to be installed (and warns), so results can differ between machines.
soup = BeautifulSoup(response.text, 'html.parser')

data = []
for story in soup.select('.athing'):
    anchor = story.select_one('span a')
    # find_next_sibling('tr') skips whitespace text nodes, which a bare
    # .next_sibling may return instead of the following row.
    meta_row = story.find_next_sibling('tr')
    score = meta_row.select_one('.score') if meta_row else None
    data.append({
        'title': anchor.text,
        'link': anchor.get('href'),
        # Rows without a score element get None instead of raising.
        'score': score.get_text(strip=True) if score else None,
    })
data
Output
[{'title': 'Stop the proposal on mass surveillance of the EU',
'link': 'https://mullvad.net/nl/blog/2023/2/2/stop-the-proposal-on-mass-surveillance-of-the-eu/',
'score': '287 points'},
{'title': 'Bay 12 Games has made $7M from the Steam release of Dwarf Fortress',
'link': 'http://www.bay12forums.com/smf/index.php?topic=181354.0',
'score': '416 points'},
{'title': "Google's OSS-Fuzz expands fuzz-reward program to $30000",
'link': 'https://security.googleblog.com/2023/02/taking-next-step-oss-fuzz-in-2023.html',
'score': '31 points'},
{'title': "Connecticut Parents Arrested for Letting Kids Walk to Dunkin' Donuts",
'link': 'https://reason.com/2023/01/30/dunkin-donuts-parents-arrested-kids-cops-freedom/',
'score': '225 points'},
{'title': 'Ronin 2.0 – open-source Ruby toolkit for security research and development',
'link': 'https://ronin-rb.dev/blog/2023/02/01/ronin-2-0-0-finally-released.html',
'score': '62 points'},...]

using bs4 get garbled code when find values inside tag

def getAllBooksPagesURLs():
    """Return the 50 listing-page URLs the scraper walks.

    The first entry is the site root; the remaining 49 are
    ``catalogue/page-2.html`` through ``catalogue/page-50.html``.
    """
    first_page = r"http://books.toscrape.com/"
    page_template = r"http://books.toscrape.com/catalogue/page-%d.html"
    return [first_page] + [page_template % number for number in range(2, 51)]
def getAndParseURL(url):
    """Download *url* and return it parsed as a BeautifulSoup tree.

    The response body is parsed with Python's built-in ``html.parser``.
    """
    # Without a timeout requests waits indefinitely on a stalled server.
    result = requests.get(url, timeout=30)
    return BeautifulSoup(result.text, 'html.parser')
def getBooksURLs(url, z):
    """Return the absolute URL of every book linked from listing page *url*.

    url: address of the listing page to download.
    z:   base URL that each book's relative ``href`` is resolved against.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urljoin

    soup = getAndParseURL(url)
    containers = soup.find_all("div", class_="image_container")
    # urljoin resolves the href relative to z; plain `z + href` only works
    # when z ends in "/" and breaks for bases such as ".../page-2.html".
    return [urljoin(z, box.a.get('href')) for box in containers]
# Accumulators for the scrape; only books_url, list_of_resultitle and
# list_of_review_num are written to in this snippet.
books_url = []
title_list = []
main_page_list = []
list_of_rewiew_num = []
list_of_bookpage = []
list_of_resultitle = []
books_done_page = []
list_of_review_num=[]
# [0:1] restricts the run to the first listing page.
for y in getAllBooksPagesURLs()[0:1]:
main_page=getAndParseURL(y)
# Book titles are read from the `title` attribute of the link inside each <h3>.
result_of_title = main_page.findAll("h3")
for x in result_of_title:
list_of_resultitle.append(x.find("a").get("title"))
books_url = getBooksURLs(y,y)
for b in books_url:
print(b)
books_page = getAndParseURL(b)
if books_page.find("td") is None:
list_of_review_num.append(0)
else:
# .find("td") returns only the FIRST <td> on the page, so this stores the
# content of that first cell, whatever it holds.
review_num =books_page.find("td").contents[0]
list_of_review_num.append(review_num)
# Bare names: these only display values in an interactive session/notebook.
books_url
list_of_resultitle
list_of_review_num
above is my code ,the result is
['a897fe39b1053632',
'90fa61229261140a',
'6957f44c3847a760',
'e00eb4fd7b871a48',
'4165285e1663650f',
'f77dbf2323deb740',
'2597b5a345f45e1b',
'e72a5dfc7e9267b2',
'e10e1e165dc8be4a',
'1dfe412b8ac00530',
'0312262ecafa5a40',
'30a7f60cd76ca58c',
'ce6396b0f23f6ecc',
'3b1c02bac2a429e6',
'a34ba96d4081e6a4',
'deda3e61b9514b83',
'feb7cc7701ecf901',
'e30f54cea9b38190',
'a18a4f574854aced',
'a22124811bfa8350']
The garbled values look like 'a22124811bfa8350'. Is this caused by dynamic HTML? I don't know.
My desired output for list_of_review_num would be
[0,1,2,3]
How can I get the correct output? Could you please help me? Thank you in advance.
Your code produces that output because .find() returns only the first occurrence of the td tag. The page you are working with contains several td tags, and the review count is in the last one, so you should do something like this:
cells = books_page.find_all("td")  # every <td> on the product page
if not cells:  # there are no td tags at all
    list_of_review_num.append(0)
else:
    # The review count sits in the last td; convert it to int so the list
    # holds one type only (the fallback branch above appends the int 0).
    list_of_review_num.append(int(cells[-1].get_text(strip=True)))
The issue here is that you select the UPC information and not the reviews. I recommend avoiding all these separate lists for storing your results; it is better to use dicts instead:
# One dict per book: its title, its URL and every row of its product table.
data = []
for y in getAllBooksPagesURLs()[0:1]:
    # The listing page itself is fetched inside getBooksURLs; downloading it
    # a second time into an unused variable only cost an extra request.
    books_url = getBooksURLs(y, y)
    for b in books_url:
        books_page = getAndParseURL(b)
        d = {
            'title': books_page.h1.text,
            'url': b,
        }
        # dict() consumes each table row's stripped strings as a
        # (key, value) pair, e.g. ('UPC', 'a897fe39b1053632').
        d.update(dict(x.stripped_strings for x in books_page.select('table tr')))
        data.append(d)
data
Example
import requests
from bs4 import BeautifulSoup
def getAllBooksPagesURLs():
    """Return the 50 listing-page URLs the scraper walks.

    The first entry is the site root; the remaining 49 are
    ``catalogue/page-2.html`` through ``catalogue/page-50.html``.
    """
    first_page = r"http://books.toscrape.com/"
    page_template = r"http://books.toscrape.com/catalogue/page-%d.html"
    return [first_page] + [page_template % number for number in range(2, 51)]
def getAndParseURL(url):
    """Download *url* and return it parsed as a BeautifulSoup tree.

    The response body is parsed with Python's built-in ``html.parser``.
    """
    # Without a timeout requests waits indefinitely on a stalled server.
    result = requests.get(url, timeout=30)
    return BeautifulSoup(result.text, 'html.parser')
def getBooksURLs(url, z):
    """Return the absolute URL of every book linked from listing page *url*.

    url: address of the listing page to download.
    z:   base URL that each book's relative ``href`` is resolved against.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urljoin

    soup = getAndParseURL(url)
    containers = soup.find_all("div", class_="image_container")
    # urljoin resolves the href relative to z; plain `z + href` only works
    # when z ends in "/" and breaks for bases such as ".../page-2.html".
    return [urljoin(z, box.a.get('href')) for box in containers]
# One dict per book on the first listing page: title, URL and every row of
# the book's product table.
data = []
for y in getAllBooksPagesURLs()[0:1]:
    books_url = getBooksURLs(y, y)
    for b in books_url:
        books_page = getAndParseURL(b)
        d = {
            'title': books_page.h1.text,
            'url': b,
        }
        # dict() consumes each table row's stripped strings as a
        # (key, value) pair, e.g. ('Number of reviews', '0').
        d.update(dict(x.stripped_strings for x in books_page.select('table tr')))
        data.append(d)
data
Output:
[{'title': 'A Light in the Attic',
'url': 'http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html',
'UPC': 'a897fe39b1053632',
'Product Type': 'Books',
'Price (excl. tax)': '£51.77',
'Price (incl. tax)': '£51.77',
'Tax': '£0.00',
'Availability': 'In stock (22 available)',
'Number of reviews': '0'},
{'title': 'Tipping the Velvet',
'url': 'http://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html',
'UPC': '90fa61229261140a',
'Product Type': 'Books',
'Price (excl. tax)': '£53.74',
'Price (incl. tax)': '£53.74',
'Tax': '£0.00',
'Availability': 'In stock (20 available)',
'Number of reviews': '0'},...]
Note: In newer code, avoid the old findAll() syntax; use find_all() or select() with CSS selectors instead. For more, take a minute to check the docs.

BeautifulSoup - Converting string values into int inside of nested for loop then sort

I'm trying to figure out how to convert a string value into an int within a scraped for loop in order to sort by the int ('views' within the below script).
Below is a condensed view of the problem, including a working script that returns the string, my failed attempt to fix the issue, and the desired output.
Working script that returns the string:
import requests
from bs4 import BeautifulSoup
import pprint
res = requests.get('https://www.searchenginejournal.com/category/news/')
soup = BeautifulSoup(res.text, 'html.parser')
links = soup.find_all('h2', class_='sej-ptitle')
subtext = soup.find_all('ul', class_='sej-meta-cells')
def sort_stories_by_views(sejlist):
# Returns a new list ordered by each entry's 'views' value, largest first.
return sorted(sejlist, key=lambda k: k['views'], reverse=True)
def create_custom_sej(links, subtext):
# Pairs links[idx] with subtext[idx] by position to build one dict per story.
sej = []
for idx, item in enumerate(links):
title = links[idx].getText()
href = links[idx].a.get('href', None)
# Text of the third <li> in the meta list with ' Reads' removed; the value
# stays a string at this point.
views = subtext[idx].find_all(
'li')[2].text.strip().replace(' Reads', '')
sej.append({'title': title, 'link': href, 'views': views})
return sort_stories_by_views(sej)
create_custom_sej(links, subtext)
pprint.pprint(create_custom_sej(links, subtext))
Within the above, the output contains dictionaries that look like the below:
{
'link': 'https://www.searchenginejournal.com/google-answers-if-site-section-can-impact-ranking-scores-of-
'title': 'Google Answers If Site Section Can Impact Ranking Score of Entire ''Site ',
'views': '4.5K'
}
The desired output would be:
{
'link': 'https://www.searchenginejournal.com/google-answers-if-site-section-can-impact-ranking-scores-of-
'title': 'Google Answers If Site Section Can Impact Ranking Score of Entire ''Site ',
'views': '4500'
}
My failed attempt to fix the problem is below. The script returns a single value rather than the list of all applicable values, but I'm honestly not certain whether I'm going about this in the correct way.
import requests
from bs4 import BeautifulSoup
import pprint
res = requests.get('https://www.searchenginejournal.com/category/news/')
soup = BeautifulSoup(res.text, 'html.parser')
links = soup.find_all('h2', class_='sej-ptitle')
subtext = soup.find_all('ul', class_='sej-meta-cells')
def sort_stories_by_views(sejlist):
# Returns a new list ordered by each entry's 'views' value, largest first.
return sorted(sejlist, key=lambda k: k['views'], reverse=True)
def create_custom_sej(links, subtext):
# Pairs links[idx] with subtext[idx] by position to build one dict per story.
sej = []
for idx, item in enumerate(links):
title = links[idx].getText()
href = links[idx].a.get('href', None)
views = subtext[idx].find_all(
'li')[2].text.strip().replace(' Reads', '').replace(' min read', '')
# below is my unsuccessful attempt to change the strings to int
# `views` is a str here, so this loop walks its characters and rebinds `item`.
for item in views:
if views:
multiplier = 1
if views.endswith('K'):
multiplier = 1000
views = views[0:len(views)-1]
# NOTE(review): either `return` below exits create_custom_sej itself, so
# the sej.append(...) that follows is not reached once one of them runs.
return int(float(views) * multiplier)
else:
return views
sej.append({'title': title, 'link': href, 'views': views})
return sort_stories_by_views(sej)
create_custom_sej(links, subtext)
pprint.pprint(create_custom_sej(links, subtext))
Any help would be appreciated!
Thanks.
You can try this code to convert the views to integer:
import requests
from bs4 import BeautifulSoup
import pprint
res = requests.get('https://www.searchenginejournal.com/category/news/')
soup = BeautifulSoup(res.text, 'html.parser')
links = soup.find_all('h2', class_='sej-ptitle')
subtext = soup.find_all('ul', class_='sej-meta-cells')
def convert(views):
    """Convert a view-count label such as ``'4.5K'`` into an integer.

    Accepts a leading number with optional thousands separators
    (``'1,234'``) and an optional ``K`` (thousand) or ``M`` (million)
    suffix in either case; anything after that is ignored.

    Raises:
        ValueError: if *views* does not start with a number.
    """
    import re  # local import keeps this block self-contained

    match = re.match(r'\s*(\d[\d,]*(?:\.\d+)?|\.\d+)([KkMm]\b)?', views)
    if match is None:
        raise ValueError('cannot parse view count: %r' % (views,))
    number, suffix = match.groups()
    factor = {'K': 1000, 'M': 1000000}.get((suffix or '').upper(), 1)
    # round() instead of a bare int(): a float product can land a hair below
    # the intended whole number, and int() would truncate it.
    return int(round(float(number.replace(',', '')) * factor))
def sort_stories_by_views(sejlist):
    """Return a new list of the stories ordered from most to least viewed.

    Each item of *sejlist* must be a mapping with a 'views' entry; the
    input list itself is not modified.
    """
    return sorted(sejlist, key=lambda story: story['views'], reverse=True)
def create_custom_sej(links, subtext):
    """Build one dict per story (title, link, views) sorted by views.

    links:   the headline tags; each must contain an <a> element.
    subtext: not used by this implementation; kept so existing calls
             with two arguments keep working.

    Returns the list produced by sort_stories_by_views().
    """
    sej = []
    for headline in links:
        title = headline.getText()
        href = headline.a.get('href', None)
        # The view count is the text node that follows the "eye" icon found
        # under the headline's parent element.
        icon = headline.parent.find('i', class_='sej-meta-icon fa fa-eye')
        # `string=` is the current spelling of the deprecated `text=` keyword.
        label = icon.find_next(string=True) if icon else None
        words = label.split() if label else []
        # No icon, or only whitespace after it: count the story as 0 views.
        views = words[0] if words else '0'
        sej.append({'title': title, 'link': href, 'views': convert(views)})
    return sort_stories_by_views(sej)
create_custom_sej(links, subtext)
pprint.pprint(create_custom_sej(links, subtext))
Prints:
[{'link': 'https://www.searchenginejournal.com/microsoft-clarity-analytics/385867/',
'title': 'Microsoft Announces Clarity – Free Website '
'Analytics ',
'views': 11000},
{'link': 'https://www.searchenginejournal.com/wordpress-5-6-feature-removed-for-subpar-experience/385414/',
'title': 'WordPress 5.6 Feature Removed For Subpar '
'Experience ',
'views': 7000},
{'link': 'https://www.searchenginejournal.com/whatsapp-shopping-payment-customer-service/385362/',
'title': 'WhatsApp Announces Shopping and Payment Tools for '
'Businesses ',
'views': 6800},
{'link': 'https://www.searchenginejournal.com/google-noindex-meta-tag-proper-use/385538/',
'title': 'Google Shares How Noindex Meta Tag Can Cause '
'Issues ',
'views': 6500},
...and so on.

Why Python BeautifulSoup returns a empty list?

I'm a rookie IT student. I was trying to help my friend with his job and wanted to create a list of customers he could serve (exporting it to a file would be awesome too, but I will think about that later, I guess).
When I try to run the code it just returns an empty list, do you have any suggestions?
any suggestions/feedback would be highly appreciated!
Thank you!
(I know it may not be the best code you have ever seen, so I apologize in advance!)
import requests
from bs4 import BeautifulSoup
import pprint
res = requests.get('https://www.paginebianche.it/toscana/li/gommisti.html')
res2 = requests.get('https://www.paginebianche.it/ricerca?qs=gommisti&dv=li&p=2')
soup = BeautifulSoup(res.text, 'html.parser')
soup2 = BeautifulSoup(res2.text, 'html.parser')
links = soup.select('.org fn')
subtext = soup.select('.address')
links2 = soup2.select('.org fn')
subtext2 = soup2.select('.address')
mega_links = links + links2
mega_subtext = subtext + subtext2
def create_custom_hn(mega_links,mega_subtext):
# Builds a list of {'title', 'address'} dicts. `mega_subtext` is not read
# anywhere in this body.
hn = []
# enumerate() yields (index, element) pairs, so `links` is bound to the int
# index and `address` to the element taken from mega_links.
for links,address in enumerate(mega_links):
title = links.getText()
address= address.getText()
hn.append({'title': title, 'address': address})
return hn
pprint.pprint(create_custom_hn(mega_links,mega_subtext))
The selector .org fn is wrong: it looks for an <fn> tag inside an element with class org. It should be .org.fn, which selects all elements that have both the org and fn classes.
However, some items don't have an .address element, so your code produces skewed results. You can use this example to get the titles and addresses (when an address is missing, - is used):
import pprint
import requests
from itertools import chain
from bs4 import BeautifulSoup
# Collect the title and address of every listing on both result pages.
urls = [
    'https://www.paginebianche.it/toscana/li/gommisti.html',
    'https://www.paginebianche.it/ricerca?qs=gommisti&dv=li&p=2',
]
# One parsed tree per page, built by the same code path instead of a
# copy-pasted res/res2, soup/soup2 pair.
pages = [BeautifulSoup(requests.get(url).text, 'html.parser') for url in urls]

hn = []
for item in chain.from_iterable(page.select('.item') for page in pages):
    if item.h2 is None:
        continue  # an .item block without a heading has no title to report
    title = item.h2.getText(strip=True)
    addr = item.select_one('[itemprop="address"]')
    # Listings without an address element are reported with '-'.
    addr = addr.getText(strip=True, separator='\n') if addr else '-'
    hn.append({'title': title, 'address': addr})
pprint.pprint(hn)
Prints:
[{'address': 'Via Don Giovanni Minzoni 44\n-\n57025\nPiombino (LI)',
'title': 'CENTROGOMMA'},
{'address': 'Via Quaglierini 14\n-\n57123\nLivorno (LI)',
'title': 'F.LLI CAPALDI'},
{'address': 'Via Ugione 9\n-\n57121\nLivorno (LI)',
'title': 'PNEUMATICI INTERGOMMA GOMMISTA'},
{'address': "Viale Carducci Giosue' 88/90\n-\n57124\nLivorno (LI)",
'title': 'ITALMOTORS'},
{'address': 'Piazza Chiesa 53\n-\n57124\nLivorno (LI)',
'title': 'Lo Coco Pneumatici'},
{'address': '-', 'title': 'PIERO GOMME'},
{'address': 'Via Pisana Livornese Nord 95\n-\n57014\nVicarello (LI)',
'title': 'GOMMISTA TRAVAGLINI PNEUMATICI'},
{'address': 'Via Cimarosa 165\n-\n57124\nLivorno (LI)',
'title': 'GOMMISTI CIONI AUTORICAMBI & SERVIZI'},
{'address': 'Loc. La Cerretella, 219\n-\n57022\nCastagneto Carducci (LI)',
'title': 'AURELIA GOMME'},
{'address': 'Strada Provinciale Vecchia Aurelia 243\n'
'-\n'
'57022\n'
'Castagneto Carducci (LI)',
'title': 'AURELIA GOMME DI GIANNELLI SIMONE'},
...and so on.

How Can I follow A Link to a specific post and scrape the data from the

I am trying to follow links from posts that I have scraped so I can save the text. I am partly there; I just need to tweak some things, which is why I'm here. Instead of different posts I am getting duplicates, and not only that, they are surrounded by brackets like this:
[[<div class="article-body" id="image-description"><p>Kanye West premiered
the music video for "Famous" off his "The Life of Pablo" album to a
sold out audience in Los Angeles. The video features nude versions of George W. Bush.
Donald Trump. Anna Wintour. Rihanna. Chris Brown. Taylor Swift.
Kanye West. Kim Kardashian. Ray J. Amber Rose. Caitlyn Jenner.
Bill Cosby (in that order).</p></div>],
And here's my code:
def sprinkle():
# Scrapes the 'entry-pos-1' divs of the front page and returns up to six
# dicts with 'href', 'src', 'text' and 'comments' keys.
# `headers` is not defined in this snippet; presumably a module-level dict.
url_two = 'http://www.example.com'
html = requests.get(url_two, headers=headers)
soup = BeautifulSoup(html.text, 'html5lib')
titles = soup.find_all('div', {'class': 'entry-pos-1'})
def make_soup(url):
# Fetches one linked page and returns ALL 'article-body' divs as a result
# list (find_all), not a single tag.
the_comments_page = requests.get(url, headers=headers)
soupdata = BeautifulSoup(the_comments_page.text, 'html5lib')
comment = soupdata.find_all('div', {'class': 'article-body'})
return comment
comment_links = [url_two + link.a.get('href') for link in titles]
# `soup` is rebound here to a list holding the make_soup() result of every
# link, i.e. a list of lists.
soup = [make_soup(comments) for comments in comment_links]
# soup = make_soup(comments)
# print(soup)
# Every entry below receives the same whole `soup` list under 'comments'.
entries = [{'href': url_two + div.a.get('href'),
'src': url_two + div.a.img.get('data-original'),
'text': div.find('p', 'entry-title').text,
'comments': soup
} for div in titles][:6]
return entries
I feel like I'm close. this is all new to me. Any help would be great.
I figured it out
def sprinkle():
    """Scrape the first six front-page entries and their article paragraphs.

    Returns a list of dicts with the keys 'href', 'src', 'text' and
    'comments'; 'comments' holds the <p> tags found inside the linked
    article's 'article-body' div (an empty list when that div is missing).

    Uses the name `headers` from the enclosing module for every request.
    """
    url_two = 'http://www.vladtv.com'
    html = requests.get(url_two, headers=headers)
    soup = BeautifulSoup(html.text, 'html5lib')
    titles = soup.find_all('div', {'class': 'entry-pos-1'})

    def make_soup(url):
        """Return the <p> tags of the article body at *url*."""
        the_comments_page = requests.get(url, headers=headers)
        soupdata = BeautifulSoup(the_comments_page.text, 'html5lib')
        comment = soupdata.find('div', {'class': 'article-body'})
        if comment is None:
            # No article body on the page: report no paragraphs instead of
            # failing with AttributeError on None.
            return []
        return comment.find_all('p')

    entries = []
    # Slice BEFORE building the entries so only six article pages are
    # downloaded, not one per front-page entry.
    for div in titles[:6]:
        href = url_two + div.a.get('href')
        entries.append({
            'href': href,
            'src': url_two + div.a.img.get('data-original'),
            'text': div.find('p', 'entry-title').text,
            'comments': make_soup(href),
        })
    return entries
I'm still trying to remove the brackets from the result, though.

Categories