how to get the content of a title using BeautifulSoup4 and requests - python

So i have taken the title of the medicines from this link : Medicines List
Now I want to get the content for every medicine; meanwhile, every medicine has its own link.
Example :
Medicines Example
How can I get the content of those medicines using the BeautifulSoup4 and requests libraries?
import requests
from bs4 import BeautifulSoup
from pprint import pp

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0'
}


def main(url):
    """Print a numbered list of medicine titles scraped from the index page.

    The titles come from every anchor whose class ends with
    "section__item-link"; output format is "<n>. <title>\n" per entry.
    """
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    title = [x.text for x in soup.select(
        'a[class$=section__item-link]')]
    # enumerate(start=1) replaces the manual counter + range(len(...)) loop;
    # the printed output is identical.
    for count, name in enumerate(title, start=1):
        print("{0}. {1}\n".format(count, name))


main('https://www.klikdokter.com/obat')

Based on what I can see as the response from https://www.klikdokter.com/obat you should be able to do something like this:-
import requests
from bs4 import BeautifulSoup

AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_5_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'
BASEURL = 'https://www.klikdokter.com/obat'
headers = {'User-Agent': AGENT}

# Fetch the medicines index page and fail loudly on any HTTP error.
response = requests.get(BASEURL, headers=headers)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')
for tag in soup.find_all('a', class_='topics-index--section__item-link'):
    href = tag.get('href')
    if href is None:
        continue
    print(href)
    # Follow each per-medicine link; its page is ready for further parsing.
    response = requests.get(href, headers=headers)
    response.raise_for_status()
    # Do your processing here

Related

getting an empty list when trying to extract urls from google with beautifulsoup

I am trying to extract the first 100 urls that return from a location search in google
However, I am getting an empty list every time ("no results found").
import requests
from bs4 import BeautifulSoup


def get_location_info(location):
    """Collect up to 100 result hrefs for a Google search of '<location> information'.

    NOTE(review): this targets the old "r" result class, which current Google
    markup no longer uses — find_all comes back empty, hence the question.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
    }
    url = "https://www.google.com/search?q=" + location + " information"
    soup = BeautifulSoup(requests.get(url, headers=headers).text, 'html.parser')
    results = soup.find_all("div", class_="r")
    if not results:
        print("No search results found.")
        return []
    # Keep at most the first 100 links.
    return [hit.find("a")["href"] for hit in results[:100]]


location = "Athens"
print(get_location_info(location))
No search results found.
[]
I have also tried this approach :
import requests
from bs4 import BeautifulSoup


def get_location_info(location):
    """Return the first 10 result hrefs for a Google search of '<location> information'.

    NOTE(review): the "r" result class is obsolete in current Google markup,
    so find_all() returns nothing and the list comes back empty — that is the
    problem being asked about.
    """
    query = location + " information"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
    }
    url = "https://www.google.com/search?q=" + query
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    results = soup.find_all("div", class_="r")
    websites = [result.find("a")["href"] for result in results][:10]
    return websites


location = "sifnos"
# Fixed: the original snippet had a stray trailing backtick after this call,
# which is a syntax error.
print(get_location_info(location))
And I get an empty list. I think I am doing everything suggested in similar posts, but I still get nothing.
Always and first of all, take a look at your soup to see if all the expected ingredients are in place.
Select your elements more specific in this case for example with css selector:
[a.get('href') for a in soup.select('a:has(>h3)')]
To avoid the consent banner, also send some cookies:
cookies={'CONSENT':'YES+'}
Example
import requests
from bs4 import BeautifulSoup


def get_location_info(location):
    """Return the result links for a Google search of '<location> information'."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
    }
    query = location + " information"
    response = requests.get(
        "https://www.google.com/search?q=" + query,
        headers=headers,
        cookies={'CONSENT': 'YES+'},  # skips the consent interstitial
    )
    soup = BeautifulSoup(response.text, 'html.parser')
    # Result links are the <a> tags that directly wrap an <h3> heading.
    return [a.get('href') for a in soup.select('a:has(>h3)')]


location = "sifnos"
print(get_location_info(location))
Output
['https://www.griechenland.de/sifnos/', 'http://de.sifnos-greece.com/plan-trip-to-sifnos/travel-information.php', 'https://www.sifnosisland.gr/', 'https://www.visitgreece.gr/islands/cyclades/sifnos/', 'http://www.griechenland-insel.de/Hauptseiten/sifnos.htm', 'https://worldonabudget.de/sifnos-griechenland/', 'https://goodmorningworld.de/sifnos-griechenland/', 'https://de.wikipedia.org/wiki/Sifnos', 'https://sifnos.gr/en/sifnos/', 'https://www.discovergreece.com/de/cyclades/sifnos']

How to get all 'href' with soup in python ? I try so many times but not work

How can I get all 'href' attributes with soup in Python? I have tried many times, but in vain.
Whether I use the 'soup.find' or 'soup.find_all' method to search for the 'href', it doesn't work.
python version:3.10
# !pip install requests   (notebook-only shell command; commented out so the
# snippet is valid plain Python)
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup

productlink = []
headers = {'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Mobile Safari/537.36'}

for page in range(1, 2):
    # NOTE(review): the f-prefix is missing, so "{page}" is sent literally —
    # this is the bug the accepted answer fixes.
    url = "https://www.momomall.com.tw/s/103487/dcategory/all/3/{page}"
    r = requests.get(url, headers=headers)
    Soup = BeautifulSoup(r.text, "lxml")
    # NOTE(review): 'ul.searchItem.Stype' does not match this page's markup,
    # so no hrefs are ever collected.
    for link in Soup.find_all('ul', class_="searchItem Stype"):
        print(len(link))
        Link = link.li.a
        LINK = Link.get('href')
        print(LINK)
        productlink.append(LINK)
print(productlink)
Sorry, I totally misunderstood your problem. find_all is not a very versatile tool, and you were searching for the wrong ul.
I barely changed your code, but it seems to work now.
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup

productlink = []
headers = {'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Mobile Safari/537.36'}

for page in range(1, 2):
    url = f"https://www.momomall.com.tw/s/103487/dcategory/all/3/{page}"
    r = requests.get(url, headers=headers)
    Soup = BeautifulSoup(r.text, "lxml")
    # First product anchor of each list item under the #surveyContent list.
    for link in Soup.select('ul#surveyContent > li > a[href]:first-of-type'):
        print(len(link))
        LINK = link.get('href')
        print(LINK)
        productlink.append(LINK)
print(productlink)
for page in range(1,2):
url = "https://m.momomall.com.tw/m/store/DCategory.jsp?entp_code=103487&category_code=all&orderby=3&page={}".format(page)
r = requests.get(url,headers = headers)
soup = BeautifulSoup(r.text,'lxml')
for goods_code in soup.select('a.nofollowBtn_star'):
Goods_code = 'https://www.momomall.com.tw/s/103487/'+goods_code.get('goods_code')+'/'
goodlink.append(Goods_code)
for URL in goodlink:
R = requests.get(URL, headers = headers)
Soup = BeautifulSoup(R.text,"lxml")
for dataprice in Soup.select('script'):
import re
discount_regex=re.compile('discountPrice = (\d{1,5})')
print(re.search(discount_regex, dataprice).group(1))
```

How to extract text from the different id from beautifulsoup

I want to extract text by id, but every id has a different value; check it:
div',id='statement80863
div',id='statement26092
and so on ............................
CODE
import requests
from bs4 import BeautifulSoup
import re

limit = 100
url = f'https://www.counselingcalifornia.com/cc/cgi-bin/utilities.dll/customlist?FIRSTNAME=~&LASTNAME=~&ZIP=&DONORCLASSSTT=&_MULTIPLE_INSURANCE=&HASPHOTOFLG=&_MULTIPLE_EMPHASIS=&ETHNIC=&_MULTIPLE_LANGUAGE=ENG&QNAME=THERAPISTLIST&WMT=NONE&WNR=NONE&WHP=therapistHeader.htm&WBP=therapistList.htm&RANGE=1%2F{limit}&SORT=LASTNAME'
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Mobile Safari/537.36'}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')

for row in soup.find_all('div', {'class': 'row'}):
    # NOTE(review): the id is hard-coded, but each row carries a different
    # "statementNNNNN" id — this mismatch is what the question asks about.
    des = row.find('div', id='statement80863').text
    print(des)
You can use Regular Expressions to select only such <div> tags.
row.find('div', {'id': re.compile('^statement.*')}) - will select all the <div> tags that has an id which starts with the word statement.
import re
import requests
from bs4 import BeautifulSoup

url = 'https://www.counselingcalifornia.com/cc/cgi-bin/utilities.dll/customlist?FIRSTNAME=~&LASTNAME=~&ZIP=&DONORCLASSSTT=&_MULTIPLE_INSURANCE=&HASPHOTOFLG=&_MULTIPLE_EMPHASIS=&ETHNIC=&_MULTIPLE_LANGUAGE=ENG&QNAME=THERAPISTLIST&WMT=NONE&WNR=NONE&WHP=therapistHeader.htm&WBP=therapistList.htm&RANGE=1%2F100&SORT=LASTNAME'
headers = {"User-agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"}

r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'lxml')

# Match any id beginning with "statement" (e.g. statement80863); the original
# pattern '^statement*' only made the final 't' optional/repeated.
statement_id = re.compile(r'^statement')
rows = soup.find_all('div', class_='row')
for row in rows:
    d = row.find('div', {'id': statement_id})
    if d:
        pass  # Your scraping code here...  (fixed: the bare comment left
              # `if d:` without a body, which is a syntax error)

Python BeautifulSoup - extract URLs and request pages and then retrieve abstracts

I want to access the E-journal page and then retrieve every abstract of the articles.
So I wrote the code that makes a list of the URLs of abstract pages. And it works successfully.
But when I tried to request the URLs and retrieve the abstracts, it didn't work. (with many 'None' in the console.)
This is my code.
import requests
from bs4 import BeautifulSoup

h = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'}
URL = "https://agsjournals.onlinelibrary.wiley.com/toc/15325415/2021/69/7"

JAGS_result = requests.get(URL, headers=h)
JAGS_soup = BeautifulSoup(JAGS_result.text, "html.parser")

# Relative hrefs of every "Abstract" anchor on the table-of-contents page.
L = [link.get('href') for link in JAGS_soup.find_all('a', {"title": "Abstract"})]

# Prefix each relative link with the site root to get absolute URLs.
Ab_Links = ["https://agsjournals.onlinelibrary.wiley.com" + href for href in L]
print(Ab_Links)

Ab = []
for ab_link in Ab_Links:
    Ab_result = requests.get(ab_link, headers = h)
    Ab_soup = BeautifulSoup(Ab_result.text, "html.parser")
    # NOTE(review): passing a dict as find()'s first (name) argument does not
    # search by class, so this yields None for every page — the 'None's the
    # question reports.
    abstract = Ab_soup.find({"class" : "article-section article-section__abstract"})
    Ab.append(abstract)
print(Ab)
I am a novice to python and HTML so it is very hard to write code by myself. Please help me...
import requests
from bs4 import BeautifulSoup, SoupStrainer
from urllib.parse import urljoin

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0'
}


def main(url):
    """Print the abstract section of every article linked from the TOC page."""
    with requests.Session() as req:
        req.headers.update(headers)
        r = req.get(url)
        # SoupStrainer restricts parsing to the <a title="Abstract"> anchors,
        # keeping the soup small.
        soup = BeautifulSoup(
            r.content, 'lxml', parse_only=SoupStrainer('a', title='Abstract'))
        for anchor in soup.select('a'):
            page = req.get(urljoin(url, anchor['href']))
            article = BeautifulSoup(page.text, 'lxml')
            print(article.select_one('.article-section.article-section__abstract'))


if __name__ == "__main__":
    main('https://agsjournals.onlinelibrary.wiley.com/toc/15325415/2021/69/7')
You could try this out.
This prints the abstract of all the articles in the page.
import requests
import bs4 as bs

url = 'https://agsjournals.onlinelibrary.wiley.com/toc/15325415/2021/69/7'
h = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'}
base_url = 'https://agsjournals.onlinelibrary.wiley.com'

resp = requests.get(url, headers=h)
soup = bs.BeautifulSoup(resp.text, 'lxml')

# Every "Abstract" anchor on the TOC page leads to one article page.
for anchor in soup.findAll('a', attrs={'title': 'Abstract'}):
    detail = requests.get(base_url + anchor['href'], headers=h)
    detail_soup = bs.BeautifulSoup(detail.text, 'lxml')
    # Print the full abstract text of each article.
    print(detail_soup.find('section', class_='article-section article-section__full').text.strip())
Your code is mostly correct. The problem is with finding the abstract. In order to search for an element by class, use class_='...'. If you change your Abstract = line to the following, it will return results:
abstract = Ab_soup.find(class_='article-section article-section__abstract')
Also, you can simplify your loops. for ab_link in L will iterate through each item in L and then stop. You do not need to test if a == len(L), and in fact that code will never be True, because the loop will exit before a == len(L).

Why can't I scrape Amazon products by BeautifulSoup?

I am trying to scrape the heading of this Amazon listing. The code I wrote is working for some other Amazon listings, but not working for the url mentioned in the code below.
Here is the python code I've tried:
import requests
from bs4 import BeautifulSoup

url="https://www.amazon.in/BULLMER-Cotton-Printed-T-shirt-Multicolour/dp/B0892SZX7F/ref=sr_1_4?c=ts&dchild=1&keywords=Men%27s+T-Shirts&pf_rd_i=1968024031&pf_rd_m=A1VBAL9TL5WCBF&pf_rd_p=8b97601b-3643-402d-866f-95cc6c9f08d4&pf_rd_r=EPY70Y57HP1220DK033Y&pf_rd_s=merchandised-search-6&qid=1596817115&refinements=p_72%3A1318477031&s=apparel&sr=1-4&ts_id=1968123031"
headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0"}

page = requests.get(url, headers=headers)
print(page.status_code)

soup = BeautifulSoup(page.content, "html.parser")
# print(soup.prettify())
title_tag = soup.find(id="productTitle")
# Fall back to a placeholder when the title span is missing from the response.
print(title_tag.get_text() if title_tag else "default_title")
Output:
200
default_title
html code from inspector tools:
<span id="productTitle" class="a-size-large product-title-word-break">
BULLMER Mens Halfsleeve Round Neck Printed Cotton Tshirt - Combo Tshirt - Pack of 3
</span>
First, as others have commented, use a proxy service. Second, in order to reach an Amazon product page, the ASIN alone is enough.
Amazon follows this url pattern for all product pages.
https://www.amazon.(com/in/fr)/dp/<asin>
import requests
from bs4 import BeautifulSoup

# Amazon product pages all resolve via https://www.amazon.<tld>/dp/<asin>,
# so the ASIN alone is enough to build the URL.
url="https://www.amazon.in/dp/B0892SZX7F"
headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'}

page = requests.get(url, headers=headers)
print(page.status_code)

soup = BeautifulSoup(page.content, "html.parser")
title = soup.find("span", {"id":"productTitle"})
# strip=True trims the surrounding whitespace inside the <span>.
title = title.get_text(strip=True) if title else "default_title"
print(title)
Output:
200
BULLMER Mens Halfsleeve Round Neck Printed Cotton Tshirt - Combo Tshirt - Pack of 3
this worked fine for me:
import requests
from bs4 import BeautifulSoup

url="https://www.amazon.in/BULLMER-Cotton-Printed-T-shirt-Multicolour/dp/B0892SZX7F/ref=sr_1_4?c=ts&dchild=1&keywords=Men%27s+T-Shirts&pf_rd_i=1968024031&pf_rd_m=A1VBAL9TL5WCBF&pf_rd_p=8b97601b-3643-402d-866f-95cc6c9f08d4&pf_rd_r=EPY70Y57HP1220DK033Y&pf_rd_s=merchandised-search-6&qid=1596817115&refinements=p_72%3A1318477031&s=apparel&sr=1-4&ts_id=1968123031"
headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0"}

# NOTE(review): this proxy mapping is built but never handed to requests.get
# (no proxies= argument), so the request actually goes out directly.
http_proxy = "http://10.10.1.10:3128"
https_proxy = "https://10.10.1.11:1080"
ftp_proxy = "ftp://10.10.1.10:3128"
proxyDict = {
    "http" : http_proxy,
    "https" : https_proxy,
    "ftp" : ftp_proxy,
}

page = requests.get(url, headers=headers)
print(page.status_code)

soup = BeautifulSoup(page.content, "lxml")
# print(soup.prettify())
title_tag = soup.find(id="productTitle")
print(title_tag.get_text() if title_tag else "default_title")

Categories