Python BeautifulSoup - extract URLs, request pages, and retrieve abstracts

I want to access the e-journal page and then retrieve the abstract of every article.
So I wrote code that builds a list of the URLs of the abstract pages, and it works successfully.
But when I try to request those URLs and retrieve the abstracts, it doesn't work (it prints many 'None' values to the console).
This is my code.
import requests
from bs4 import BeautifulSoup

h = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'}

URL = "https://agsjournals.onlinelibrary.wiley.com/toc/15325415/2021/69/7"
JAGS_result = requests.get(URL, headers=h)
JAGS_soup = BeautifulSoup(JAGS_result.text, "html.parser")

L = []
for link in JAGS_soup.find_all('a', {"title": "Abstract"}):
    L.append(link.get('href'))

Ab_Links = []
a = 0
for ab_link in L:
    if a == len(L):
        break
    else:
        full_link = "https://agsjournals.onlinelibrary.wiley.com" + L[a]
        Ab_Links.append(full_link)
        a = a + 1
print(Ab_Links)

b = 0
Ab = []
Ab_URL = Ab_Links[b]
for ab_url in Ab_Links:
    if b == len(L):
        break
    else:
        Ab_result = requests.get(Ab_Links[b], headers=h)
        Ab_soup = BeautifulSoup(Ab_result.text, "html.parser")
        abstract = Ab_soup.find({"class": "article-section article-section__abstract"})
        Ab.append(abstract)
        b = b + 1
print(Ab)
I am a novice at Python and HTML, so it is very hard to write code by myself. Please help me...

import requests
from bs4 import BeautifulSoup, SoupStrainer
from urllib.parse import urljoin

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0'
}


def main(url):
    with requests.Session() as req:
        req.headers.update(headers)
        r = req.get(url)
        # Parse only the <a title="Abstract"> tags from the table of contents
        soup = BeautifulSoup(
            r.content, 'lxml', parse_only=SoupStrainer('a', title='Abstract'))
        links = [urljoin(url, x['href']) for x in soup.select('a')]
        for link in links:
            r = req.get(link)
            soup = BeautifulSoup(r.text, 'lxml')
            print(soup.select_one('.article-section.article-section__abstract'))


if __name__ == "__main__":
    main('https://agsjournals.onlinelibrary.wiley.com/toc/15325415/2021/69/7')
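This uses SoupStrainer so that only the <a title="Abstract"> tags are parsed from the table-of-contents page, and urljoin to build absolute abstract URLs from the relative href values instead of concatenating strings by hand.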

You could try this out; it prints the abstracts of all the articles on the page.
import requests
import bs4 as bs

url = 'https://agsjournals.onlinelibrary.wiley.com/toc/15325415/2021/69/7'
h = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'}

resp = requests.get(url, headers=h)
soup = bs.BeautifulSoup(resp.text, 'lxml')

base_url = 'https://agsjournals.onlinelibrary.wiley.com'
abstract_urls = soup.find_all('a', attrs={'title': 'Abstract'})

for i in abstract_urls:
    a_url = base_url + i['href']
    r = requests.get(a_url, headers=h)
    soup = bs.BeautifulSoup(r.text, 'lxml')
    abs_text = soup.find('section', class_='article-section article-section__full').text.strip()
    print(abs_text)

Your code is mostly correct. The problem is with finding the abstract. To search for an element by class, use class_='...'. If you change your abstract = line to the following, it will return results:
abstract = Ab_soup.find(class_='article-section article-section__abstract')
Also, you can simplify your loops. for ab_link in L will iterate through each item in L and then stop, so you do not need to test if a == len(L); in fact, that condition will never be True, because the loop exits before a reaches len(L). A simplified version is sketched below.
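Putting both changes together, here is a minimal sketch of the simplified version (same URLs and selectors as your code):

import requests
from bs4 import BeautifulSoup

h = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'}
base = "https://agsjournals.onlinelibrary.wiley.com"

result = requests.get(base + "/toc/15325415/2021/69/7", headers=h)
soup = BeautifulSoup(result.text, "html.parser")

# Build absolute abstract URLs directly from the anchor tags
ab_links = [base + link.get('href') for link in soup.find_all('a', {"title": "Abstract"})]

abstracts = []
for ab_link in ab_links:  # no counter or break needed
    ab_soup = BeautifulSoup(requests.get(ab_link, headers=h).text, "html.parser")
    abstracts.append(ab_soup.find(class_='article-section article-section__abstract'))
print(abstracts)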

Related

Can't retrieve an email from a webpage using the requests module

I'm trying to fetch an email address from a webpage using the requests module. The problem is that the email address seems to be encoded somehow, which is why it is unreadable, and I wish to decode it into its usual form.
import requests
from bs4 import BeautifulSoup

link = 'https://global-standard.org/find-suppliers-shops-and-inputs/certified-suppliers/database/search_result/38996'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
}

res = requests.get(link, headers=headers)
soup = BeautifulSoup(res.text, "html.parser")
email = soup.select_one("script[type='text/javascript']:-soup-contains('emailProtector')").contents[0]
print(email)
When I run the above script, the following is what I get:
emailProtector.addCloakedMailto("ep_586c4771", 1);
This is the result I'm after:
fttextilegroup2017#gmail.com
You can try:
import re
import requests
from bs4 import BeautifulSoup

url = 'https://global-standard.org/find-suppliers-shops-and-inputs/certified-suppliers/database/search_result/38996'


def decloak(cloaked_tag, attr_name):
    a, b = "", ""
    for span in cloaked_tag.select('span'):
        for attr in span.attrs:
            if attr == attr_name:
                a += span[attr]      # fragments under the generated attribute, joined in order
            else:
                b = span[attr] + b   # remaining fragments, reassembled in reverse
    return a + b


soup = BeautifulSoup(requests.get(url).content, 'html.parser')

# The attribute name holding the e-mail fragments is generated by the page's
# JavaScript, so extract it from the script source with a regex
attr_name = re.search(r'nodeName\.toLowerCase\(\)\.indexOf\("(.*?)"', str(soup)).group(1)

mail = decloak(soup.select_one('.cloaked_email'), attr_name)
print(mail)
Prints:
fttextilegroup2017#gmail.com
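In short, the address is hidden in <span> attribute values rather than in text: decloak joins the fragments stored under the generated attribute name in order and prepends the remaining fragments in reverse, reproducing what the page's script does in the browser.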

What is the fix for this Error: 'NoneType' object has no attribute 'prettify'

I want to scrape this URL: https://aviation-safety.net/wikibase/type/C206.
I don't understand the meaning of the error below:
'NoneType' object has no attribute 'prettify'
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import Request

url = 'https://aviation-safety.net/wikibase/type/C206'
req = Request(url, headers={
    'accept': '*/*',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'})

data = []
while True:
    print(url)
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    data.append(pd.read_html(soup.select_one('tbody').prettify())[0])
    if soup.select_one('div.pagenumbers + div a[href]'):
        url = soup.select_one('div.pagenumbers + div a')['href']
    else:
        break

df = pd.concat(data)
df.to_csv('206.csv', encoding='utf-8-sig', index=False)
You're not using headers with requests, which is why you're not getting the right HTML, and the table you're after is the second one, not the first. Also, I'd highly recommend using requests over urllib.request.
So, having said that, here's how to get all the tables from all the pages:
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = 'https://aviation-safety.net/wikibase/type/C206'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36',
}

data = []
with requests.Session() as s:
    # Read the last page number from the pagination links
    total_pages = int(
        BeautifulSoup(s.get(url, headers=headers).text, "lxml")
        .select("div.pagenumbers > a")[-1]
        .getText()
    )
    for page in range(1, total_pages + 1):
        print(f"Getting page: {page}...")
        data.append(
            pd.read_html(
                s.get(f"{url}/{page}", headers=headers).text,
                flavor="lxml",
            )[1]  # the second table on the page
        )

df = pd.concat(data)
df.to_csv('206.csv', sep=";", index=False)

How to get the content of a title using BeautifulSoup4 and requests

So I have taken the titles of the medicines from this link: Medicines List.
Now I want to get the content for every medicine; each medicine has its own link, for example: Medicines Example.
How can I get the content of those medicines using the BeautifulSoup4 and requests libraries?
import requests
from bs4 import BeautifulSoup
from pprint import pp

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0'
}


def main(url):
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    title = [x.text for x in soup.select(
        'a[class$=section__item-link]')]
    count = 0
    for x in range(0, len(title)):
        count += 1
        print("{0}. {1}\n".format(count, title[x]))


main('https://www.klikdokter.com/obat')
Based on what I can see in the response from https://www.klikdokter.com/obat, you should be able to do something like this:
import requests
from bs4 import BeautifulSoup

AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_5_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'
BASEURL = 'https://www.klikdokter.com/obat'

headers = {'User-Agent': AGENT}

response = requests.get(BASEURL, headers=headers)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

for tag in soup.find_all('a', class_='topics-index--section__item-link'):
    href = tag.get('href')
    if href is not None:
        print(href)
        response = requests.get(href, headers=headers)
        response.raise_for_status()
        """ Do your processing here """

Exception has occurred: TypeError in Python

I am very new to coding, so I am sorry if this is a dumb question. I keep getting an error every time I try to run the code for this Python scraper. Any help would be great.
Exception has occurred: TypeError
'module' object is not callable
File "C:\Users\quawee\OneDrive\seaporn.org-scraper\seaporn.org-scraper.py", line 33, in <module>
articles = requests(x)
from this code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

articlelist = []


def request(x):
    url = f'https://www.seaporn.org/category/hevc/page/{x}/'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content, features='lxml')
    return soup.find_all('article', class_='post-summary')


def parse(articles):
    for item in articles:
        link = item.find({'a': 'entry-link'})
        article = {
            'link': link['href']
        }
        articlelist.append(article)


def output():
    df = pd.DataFrame(articlelist)
    df.to_excel('articlelist.xlsx', index=False)
    print('Saved to xlsx.')


x = 5000
while True:
    print(f'Page {x}')
    articles = requests(x)
    x = x + 1
    time.sleep(3)
    if len(articles) != 0:
        parse(articles)
    else:
        break

print('Completed, total articles is', len(articlelist))
output()
The name of your defined function is request(x), but you are calling requests(x) inside the while loop.
This should work; I just corrected the spelling:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

articlelist = []


def request(x):
    url = f'https://www.seaporn.org/category/hevc/page/{x}/'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content, features='lxml')
    return soup.find_all('article', class_='post-summary')


def parse(articles):
    for item in articles:
        link = item.find({'a': 'entry-link'})
        article = {
            'link': link['href']
        }
        articlelist.append(article)


def output():
    df = pd.DataFrame(articlelist)
    df.to_excel('articlelist.xlsx', index=False)
    print('Saved to xlsx.')


x = 5000
while True:
    print(f'Page {x}')
    articles = request(x)
    x = x + 1
    time.sleep(3)
    if len(articles) != 0:
        parse(articles)
    else:
        break

print('Completed, total articles is', len(articlelist))
output()

Scrape and return a value from within a div class with Python

Any idea how I can retrieve the price (now 2917.99) from the page source of https://www.emag.ro/televizor-led-smart-samsung-138-cm-55ru7402-4k-ultra-hd-ue55ru7402uxxh/pd/DTN2XZBBM/?
If I call the class p.product-new-price I get None.
I have managed to get the title, but not the price.
What I have done so far:
import requests
from bs4 import BeautifulSoup
URL = 'https://www.emag.ro/televizor-led-smart-samsung-138-cm-55ru7402-4k-ultra-hd-ue55ru7402uxxh/pd/DTN2XZBBM/'
headers = {"User-Agent":'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
page = requests.get(URL, headers = headers)
soup = BeautifulSoup(page.content, 'html.parser')
title = soup.find('title')
div = soup.find('div', {"class" : 'product-new-price'})
text = div.string
print(text)
The markup looks like the snippet below, and I want to extract the 2917 as an int.
<div class="product-highlight product-page-pricing">
  <p class="product-new-price">
    2.917<sup>99</sup> <span>Lei</span>
  </p>
</div>
Thank you very much!
Ok, with minor modifications: it seems that the class product-new-price is on the <p> element for me (not a <div>), and I am assuming there will always be a <sup> tag after the main price.
import requests
from bs4 import BeautifulSoup

URL = 'https://www.emag.ro/televizor-led-smart-samsung-138-cm-55ru7402-4k-ultra-hd-ue55ru7402uxxh/pd/DTN2XZBBM/'
headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}

page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')

title = soup.find('title')
p = soup.find('p', {"class": 'product-new-price'})

# Get the text before the <sup> tag
value = p.find('sup').previous_sibling.strip()
print("Value: {}".format(value))

# Keep only the digits
value = ''.join(c for c in value if c.isdigit())
price = int(value)
print("Price: {}".format(price))
The above prints:
$ python3 ./test.py
Value: 2.917
Price: 2917
Now, with small changes, you can also add the missing .99 if that is required; a sketch follows.
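For example, still assuming the fraction always sits in the <sup> tag, and continuing from p and the digits-only value in the snippet above:

# p is the <p class="product-new-price"> tag and value the digits-only
# integer part from the snippet above (e.g. '2917')
cents = p.find('sup').text.strip()           # fractional part, e.g. '99'
price = float('{}.{}'.format(value, cents))  # combines to 2917.99
print("Price: {}".format(price))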
