Please help me! I wrote a simple parser, but it does not work correctly, and I don't know what is causing it.
import requests
from bs4 import BeautifulSoup

URL = 'https://stopgame.ru//topgames'
HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0', 'accept': '*/*'}
HOST = 'https://stopgame.ru'

def get_html(url, params=None):
    r = requests.get(url, headers=HEADERS, params=params)
    return r

def get_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('a', class_="lent-block game-block")
    print(items)

def parse():
    html = get_html(URL)
    if html.status_code == 200:
        items = get_content(html.text)
    else:
        print('Error')

parse()
I've got this output:
[]
Process finished with exit code 0
items = soup.find_all('a', class_="lent-block game-block")
You are trying to find anchor tags with the class "lent-block game-block", which does not actually exist in the HTML, and hence you are getting an empty list.
Try this div instead and you will get the list of matched items:
items = soup.find_all('div', class_="lent-block lent-main")
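For completeness, a minimal sketch of a corrected get_content built around that selector (pulling the href from an anchor inside each block is my assumption about the markup, so adjust it to what the page actually contains):

def get_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    # the wrapping div carries the classes, not the anchor itself
    items = soup.find_all('div', class_="lent-block lent-main")
    games = []
    for item in items:
        link = item.find('a')  # assumption: each block wraps one anchor
        if link is not None and link.get('href'):
            games.append(HOST + link['href'])
    return games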
I have a list of links, and each link has an id that is in the Id list.
How do I change the code so that, while iterating over the links, the corresponding id is substituted into the string?
The full code is below:
import pandas as pd
from bs4 import BeautifulSoup
import requests

HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/81.0.4044.138 Safari/537.36 OPR/68.0.3618.125', 'accept': '*/*'}

links = ['https://www..ie', 'https://www..ch', 'https://www..com']
Id = ['164240372761e5178f0488d', '164240372661e5178e1b377', '164240365661e517481a1e6']

def get_html(url, params=None):
    r = requests.get(url, headers=HEADERS, params=params)
    return r

def get_data_no_products(html):
    data = []
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', id='')  # how do I paste the corresponding id here on each iteration?
    for item in items:
        data.append({'pn': item.find('a').get('href')})
    return print(data)

def parse():
    for i in links:
        html = get_html(i)
        get_data_no_products(html.text)

parse()
Parametrise your code:
def get_data_no_products(html, id_):
    data = []
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', id=id_)
And then use zip():
for link, id_ in zip(links, ids):
    html = get_html(link)
    get_data_no_products(html.text, id_)
Note that there's a likely bug in your code: you return print(data), which will always be None. You likely just want to return data.
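Putting it all together, a minimal sketch of the corrected flow (links and ids are the lists from your question, with Id renamed to ids; note that get_html must actually return the response, the fetch happens inside the loop, and the function now returns data):

import requests
from bs4 import BeautifulSoup

HEADERS = {'user-agent': 'Mozilla/5.0', 'accept': '*/*'}
links = ['https://www..ie', 'https://www..ch', 'https://www..com']
ids = ['164240372761e5178f0488d', '164240372661e5178e1b377', '164240365661e517481a1e6']

def get_html(url, params=None):
    return requests.get(url, headers=HEADERS, params=params)

def get_data_no_products(html, id_):
    data = []
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', id=id_)  # the id now arrives as a parameter
    for item in items:
        data.append({'pn': item.find('a').get('href')})
    return data  # return the data instead of printing it

def parse():
    for link, id_ in zip(links, ids):  # pairs each link with its id
        html = get_html(link)
        print(get_data_no_products(html.text, id_))

parse()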
PS
There is another solution to this, which you will frequently encounter from people beginning in Python:
for i in range(len(links)):
    link = links[i]
    id_ = ids[i]
    ...
This... works. It might even be easier or more natural if you are coming from, e.g., C. (Then again, I'd likely use pointers...) Style is very much personal, but if you're going to write in a high-level language like Python, you might as well avoid thinking about things like 'the index of the current item' as much as possible. Just my £0.02.
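For what it's worth, if you genuinely do need the index, enumerate gives it to you without the range(len(...)) bookkeeping:

for i, link in enumerate(links):
    id_ = ids[i]
    ...

Though here zip() remains the cleaner fit, since you never need i itself.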
I want to access the E-journal page and then retrieve every abstract of the articles.
So I wrote code that builds a list of the URLs of the abstract pages, and it works successfully.
But when I tried to request those URLs and retrieve the abstracts, it didn't work (with many 'None' entries in the console).
This is my code.
import requests
from bs4 import BeautifulSoup

h = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'}
URL = "https://agsjournals.onlinelibrary.wiley.com/toc/15325415/2021/69/7"
JAGS_result = requests.get(URL, headers=h)
JAGS_soup = BeautifulSoup(JAGS_result.text, "html.parser")

L = []
for link in JAGS_soup.find_all('a', {"title": "Abstract"}):
    L.append(link.get('href'))

Ab_Links = []
a = 0
for ab_link in L:
    if a == len(L):
        break
    else:
        full_link = "https://agsjournals.onlinelibrary.wiley.com" + L[a]
        Ab_Links.append(full_link)
        a = a + 1

print(Ab_Links)

b = 0
Ab = []
Ab_URL = Ab_Links[b]
for ab_url in Ab_Links:
    if b == len(L):
        break
    else:
        Ab_result = requests.get(Ab_Links[b], headers=h)
        Ab_soup = BeautifulSoup(Ab_result.text, "html.parser")
        abstract = Ab_soup.find({"class": "article-section article-section__abstract"})
        Ab.append(abstract)
        b = b + 1

print(Ab)
I am a novice at Python and HTML, so it is very hard for me to write this code by myself. Please help me...
import requests
from bs4 import BeautifulSoup, SoupStrainer
from urllib.parse import urljoin

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0'
}

def main(url):
    with requests.Session() as req:
        req.headers.update(headers)
        r = req.get(url)
        soup = BeautifulSoup(
            r.content, 'lxml', parse_only=SoupStrainer('a', title='Abstract'))
        links = [urljoin(url, x['href']) for x in soup.select('a')]
        for link in links:
            r = req.get(link)
            soup = BeautifulSoup(r.text, 'lxml')
            print(soup.select_one('.article-section.article-section__abstract'))

if __name__ == "__main__":
    main('https://agsjournals.onlinelibrary.wiley.com/toc/15325415/2021/69/7')
You could try this out.
It prints the abstracts of all the articles on the page.
import requests
import bs4 as bs

url = 'https://agsjournals.onlinelibrary.wiley.com/toc/15325415/2021/69/7'
h = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'}
resp = requests.get(url, headers=h)
soup = bs.BeautifulSoup(resp.text, 'lxml')

base_url = 'https://agsjournals.onlinelibrary.wiley.com'
abstract_urls = soup.findAll('a', attrs={'title': 'Abstract'})

for i in abstract_urls:
    a_url = base_url + i['href']
    r = requests.get(a_url, headers=h)
    soup = bs.BeautifulSoup(r.text, 'lxml')
    abs_text = soup.find('section', class_='article-section article-section__full').text.strip()
    print(abs_text)
Your code is mostly correct. The problem is with finding the abstract. In order to search for an element by class, use class_='...'. If you change your abstract = line to the following, it will return results:
abstract = Ab_soup.find(class_='article-section article-section__abstract')
Also, you can simplify your loops. for ab_link in L will iterate through each item in L and then stop, so you do not need to test a == len(L); in fact, that condition will never be True, because the loop exits before a reaches len(L).
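To illustrate, here is a minimal sketch of how the second half of your script collapses once the counters go away (same names and selectors as your code, with an added None check since a page might lack that section):

Ab_Links = ["https://agsjournals.onlinelibrary.wiley.com" + href for href in L]
print(Ab_Links)

Ab = []
for ab_url in Ab_Links:
    Ab_result = requests.get(ab_url, headers=h)
    Ab_soup = BeautifulSoup(Ab_result.text, "html.parser")
    abstract = Ab_soup.find(class_='article-section article-section__abstract')
    if abstract is not None:  # skip pages where the section is missing
        Ab.append(abstract)
print(Ab)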
So I have taken the titles of the medicines from this link: Medicines List
Now I want to get the content for every medicine, where every medicine has its own link.
Example:
Medicines Example
How can I get the content of those medicines using the BeautifulSoup4 and requests libraries?
import requests
from bs4 import BeautifulSoup
from pprint import pp

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0'
}

def main(url):
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    title = [x.text for x in soup.select(
        'a[class$=section__item-link]')]
    count = 0
    for x in range(0, len(title)):
        count += 1
        print("{0}. {1}\n".format(count, title[x]))

main('https://www.klikdokter.com/obat')
Based on what I can see in the response from https://www.klikdokter.com/obat, you should be able to do something like this:
import requests
from bs4 import BeautifulSoup

AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_5_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'
BASEURL = 'https://www.klikdokter.com/obat'
headers = {'User-Agent': AGENT}

response = requests.get(BASEURL, headers=headers)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

for tag in soup.find_all('a', class_='topics-index--section__item-link'):
    href = tag.get('href')
    if href is not None:
        print(href)
        response = requests.get(href, headers=headers)
        response.raise_for_status()
        """ Do your processing here """
I've created a script in Python that uses POST HTTP requests to get the search results from a webpage. To populate the results, it is necessary to click on the fields sequentially, as shown here. A new page then appears, and that is how the results get populated.
There are ten results on the first page, and the following script can parse them flawlessly.
What I wish to do now is use those results to reach their inner pages in order to parse the Sole Proprietorship Name (English) from there.
website address
I've tried so far with:
import re
import requests
from bs4 import BeautifulSoup

url = "https://www.businessregistration.moc.gov.kh/cambodia-master/service/create.html?targetAppCode=cambodia-master&targetRegisterAppCode=cambodia-br-soleproprietorships&service=registerItemSearch"

payload = {
    'QueryString': '0',
    'SourceAppCode': 'cambodia-br-soleproprietorships',
    'OriginalVersionIdentifier': '',
    '_CBASYNCUPDATE_': 'true',
    '_CBHTMLFRAG_': 'true',
    '_CBNAME_': 'buttonPush'
}

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
    res = s.get(url)
    target_url = res.url.split("&")[0].replace("view.", "update.")
    node = re.findall(r"nodeW\d.+?-Advanced", res.text)[0].strip()
    payload['_VIKEY_'] = re.findall(r"viewInstanceKey:'(.*?)',", res.text)[0].strip()
    payload['_CBHTMLFRAGID_'] = re.findall(r"guid:(.*?),", res.text)[0].strip()
    payload[node] = 'N'
    payload['_CBNODE_'] = re.findall(r"Callback\('(.*?)','buttonPush", res.text)[2]
    payload['_CBHTMLFRAGNODEID_'] = re.findall(r"AsyncWrapper(W\d.+?)'", res.text)[0].strip()
    res = s.post(target_url, data=payload)
    soup = BeautifulSoup(res.content, 'html.parser')
    for item in soup.find_all("span", class_="appReceiveFocus")[3:]:
        print(item.text)
How can I parse the Name (English) from the inner page of each result using requests?
This is one way you can parse the name from each result's inner page and then the email address from the Addresses tab. I added the get_email() function only because I wanted to show how you can parse content from a different tab.
import re
import requests
from bs4 import BeautifulSoup

url = "https://www.businessregistration.moc.gov.kh/cambodia-master/service/create.html?targetAppCode=cambodia-master&targetRegisterAppCode=cambodia-br-soleproprietorships&service=registerItemSearch"
result_url = "https://www.businessregistration.moc.gov.kh/cambodia-master/viewInstance/update.html?id={}"
base_url = "https://www.businessregistration.moc.gov.kh/cambodia-br-soleproprietorships/viewInstance/update.html?id={}"

def get_names(s):
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
    res = s.get(url)
    target_url = result_url.format(res.url.split("id=")[1])
    soup = BeautifulSoup(res.text, "lxml")
    payload = {i['name']: i.get('value', '') for i in soup.select('input[name]')}
    payload['QueryString'] = 'a'
    payload['SourceAppCode'] = 'cambodia-br-soleproprietorships'
    payload['_CBNAME_'] = 'buttonPush'
    payload['_CBHTMLFRAG_'] = 'true'
    payload['_VIKEY_'] = re.findall(r"viewInstanceKey:'(.*?)',", res.text)[0].strip()
    payload['_CBHTMLFRAGID_'] = re.findall(r"guid:(.*?),", res.text)[0].strip()
    payload['_CBNODE_'] = re.findall(r"Callback\('(.*?)','buttonPush", res.text)[-1]
    payload['_CBHTMLFRAGNODEID_'] = re.findall(r"AsyncWrapper(W\d.+?)'", res.text)[0].strip()
    res = s.post(target_url, data=payload)
    soup = BeautifulSoup(res.text, "lxml")
    payload.pop('_CBHTMLFRAGNODEID_')
    payload.pop('_CBHTMLFRAG_')
    payload.pop('_CBHTMLFRAGID_')
    for item in soup.select("a[class*='ItemBox-resultLeft-viewMenu']"):
        payload['_CBNAME_'] = 'invokeMenuCb'
        payload['_CBVALUE_'] = ''
        payload['_CBNODE_'] = item['id'].replace('node', '')
        res = s.post(target_url, data=payload)
        soup = BeautifulSoup(res.text, 'lxml')
        address_url = base_url.format(res.url.split("id=")[1])
        node_id = re.findall(r"taba(.*)_", soup.select_one("a[aria-label='Addresses']")['id'])[0]
        payload['_CBNODE_'] = node_id
        payload['_CBHTMLFRAGID_'] = re.findall(r"guid:(.*?),", res.text)[0].strip()
        payload['_CBNAME_'] = 'tabSelect'
        payload['_CBVALUE_'] = '1'
        eng_name = soup.select_one(".appCompanyName + .appAttrValue").get_text()
        yield from get_email(s, eng_name, address_url, payload)

def get_email(s, eng_name, url, payload):
    res = s.post(url, data=payload)
    soup = BeautifulSoup(res.text, 'lxml')
    email = soup.select_one(".EntityEmailAddresses:contains('Email') .appAttrValue").get_text()
    yield eng_name, email

if __name__ == '__main__':
    with requests.Session() as s:
        for item in get_names(s):
            print(item)
The output looks like:
('AMY GEMS', 'amy.n.company#gmail.com')
('AHARATHAN LIN LIANJIN FOOD FLAVOR', 'skykoko344#gmail.com')
('AMETHYST DIAMOND KTV', 'twobrotherktv#gmail.com')
To get the Name (English), you can simply replace print(item.text) with print(item.text.split('/')[1].split('(')[0].strip()), which prints AMY GEMS.
I wrote a parser that should parse exchange rates, but it needs one final touch.
Code:
import requests
from bs4 import BeautifulSoup

URL = 'https://www.google.com/search?sxsrf=ALeKk02hYi-HCGXbHdPuek-VJRu_8qsUVg%3A1587054998453&ei=lomYXvaSG7zAmwWP_LHQBA&q=%D0%B4%D0%BE%D0%BB%D0%BB%D0%B0%D1%80+%D0%B3%D1%80%D0%B8%D0%B2%D0%BD%D0%B0&oq=&gs_lcp=CgZwc3ktYWIQARgBMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnOgQIABBHSgkIFxIFMTAtMjRKCAgYEgQxMC0yUPFtWPFtYKt8aAFwAngAgAEAiAEAkgEAmAEAoAEBqgEHZ3dzLXdperABCg&sclient=psy-ab'
HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/80.0.3987.163 Safari/537.36', 'accept': '*/*'}

def get_html(url, params=None):
    r = requests.get(url, headers=HEADERS, params=params)
    return r

def get_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', class_="VgAgW")
    currency = []
    for item in items:
        currency.append({
            'uah': item.find('span', class_='SwHCTb').get_text(strip=True),
        })
    print(f"'Now the course:' + {currency}")
    return currency

def parse():
    html = get_html(URL)
    if html.status_code == 200:
        get_content(html.text)
    else:
        print('Error')

parse()
I don’t know how to remove this: [{'uah':}]
Here is what comes out:
'Now the course:' + [{'uah': '27,22'}]
Process finished with exit code 0
currency is a list (currency = []), so when you print a list it always prints like [].
currency is a list of dicts ({'uah': ...}), so when you print a dict it always prints like {key: value}.
It looks like you need print(f"Now the course: {currency[0]['uah']}"), where [0] takes the first element of the list (a dict) and ['uah'] then gets the value from that dict by its key.
You can also add an additional variable, course, to make the value easier to access:
course = item.find('span', class_='SwHCTb').get_text(strip=True)
currency.append({'uah': course})
print(f"Now the course: {course}")