I'm writing a Python parser for this site: https://www.kinopoisk.ru/lists/series-top250/
import requests
from bs4 import BeautifulSoup
import csv

CSV = 'genres.csv'
URL = 'https://www.kinopoisk.ru/lists/series-top250/?page=1&tab=all'
HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0', 'accept': '*/*'}

def get_html(url, params=None):
    r = requests.get(url, headers=HEADERS, params=params)
    return r

def get_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', class_='selection-film-item-meta selection-film-item-meta_theme_desktop')
    genres = []
    for item in items:
        additional = item.find_all('span', {'class': 'selection-film-item-meta__meta-additional-item'})
        genres.append(
            {
                'genre': additional[1].get_text(strip=True)
            }
        )
    return genres

def save_genres(items, path):
    with open(path, 'w', newline='') as file:
        writer = csv.writer(file, delimiter=',')
        writer.writerow(['genre'])
        for item in items:
            writer.writerow([item['genre']])

def parser():
    html = get_html(URL)
    if html.status_code == 200:
        genres = []
        for page in range(1, 6):
            html = get_html(URL, params={'page': page})
            genres.extend(get_content(html.text))
        save_genres(genres, CSV)
        pass
    else:
        print('Non_available')

parser()
This section of the site has 5 pages of the rating:
https://www.kinopoisk.ru/lists/series-top250/?page=1&tab=all
...
https://www.kinopoisk.ru/lists/series-top250/?page=5&tab=all
I wrote a for loop to parse all the pages by changing the page number:
for page in range(1, 6):
    html = get_html(URL, params={'page': page})
    genres.extend(get_content(html.text))
but only page 1 actually gets parsed. Please tell me, what am I doing wrong?
Also, when I save the result to CSV, a single line can contain more than one value (genre designation), and I don't know how to make sure there is only one value per line for aggregated analytics.
Thank you!
Remove the parameters from the URL (everything from the ? onward):
import requests
from bs4 import BeautifulSoup
import csv

CSV = "genres.csv"
URL = "https://www.kinopoisk.ru/lists/series-top250/"
HEADERS = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0",
    "accept": "*/*",
}
PARAMS = {"page": 1, "tab": "all"}

def get_html(url, params=None):
    r = requests.get(url, headers=HEADERS, params=params)
    return r

def get_content(html):
    soup = BeautifulSoup(html, "html.parser")
    items = soup.find_all(
        "div",
        class_="selection-film-item-meta selection-film-item-meta_theme_desktop",
    )
    genres = []
    for item in items:
        additional = item.find_all(
            "span", {"class": "selection-film-item-meta__meta-additional-item"}
        )
        genres.append({"genre": additional[1].get_text(strip=True)})
    return genres

def save_genres(items, path):
    with open(path, "w", newline="") as file:
        writer = csv.writer(file, delimiter=",")
        writer.writerow(["genre"])
        for item in items:
            writer.writerow([item["genre"]])

def parser():
    genres = []
    for page in range(1, 6):
        print("Parsing page {}...".format(page))
        PARAMS["page"] = page
        html = get_html(URL, PARAMS)
        if html.status_code == 200:
            genres.extend(get_content(html.text))
        else:
            print("Non_available")
    save_genres(genres, CSV)

parser()
This creates genres.csv with a single genre column, one value per row.
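If a scraped genre string ever holds several values in one cell (the second part of the question), a small post-processing step could split them before saving. This is only a sketch: the "," separator is an assumption about how the genres are joined in the scraped text.

# Hypothetical helper: split a combined genre string such as "драма, криминал"
# into separate rows before writing the CSV. The "," separator is an
# assumption about the scraped text; adjust it to what you actually see.
def flatten_genres(items, separator=","):
    flat = []
    for item in items:
        for genre in item["genre"].split(separator):
            genre = genre.strip()
            if genre:
                flat.append({"genre": genre})
    return flat

Calling save_genres(flatten_genres(genres), CSV) inside parser() would then keep one genre per CSV row.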
Related
I am trying to explore web scraping in Python and am currently working with Beautiful Soup. I was trying to get the names of the festivals from this site: https://www.skiddle.com/festivals. Everything was going pretty fine, except for one page, this one: https://www.skiddle.com/festivals/front-end-data-test/. It says 'NoneType' object has no attribute 'find'. Is there any way I can get data from there?
Here is the code:
import requests
from bs4 import BeautifulSoup
import lxml
import json

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 OPR/89.0.4447.64"
}

# collect all fests URLs
fests_urls_list = []

# for i in range(0, 120, 24):
for i in range(0, 24, 24):
    url = f"https://www.skiddle.com/festivals/search/?ajaxing=1&sort=0&fest_name=&from_date=15%20Aug%202022&to_date=&maxprice=500&o={i}&bannertitle=August"
    req = requests.get(url=url, headers=headers)
    json_data = json.loads(req.text)
    html_response = json_data["html"]

    with open(f"data/index_{i}.html", "w", encoding="utf-8") as file:
        file.write(html_response)

    with open(f"data/index_{i}.html", "r", encoding="utf-8") as file:
        src = file.read()

    soup = BeautifulSoup(src, "lxml")
    cards = soup.find_all("a", class_="card-details-link")

    for item in cards:
        fest_url = "https://www.skiddle.com" + item.get("href")
        fests_urls_list.append(fest_url)

# collect fest info
for url in fests_urls_list:
    req = requests.get(url=url, headers=headers)
    try:
        soup = BeautifulSoup(req.text, "lxml")
        fest_name = soup.find("div", class_="MuiContainer-root MuiContainer-maxWidthFalse css-1krljt2").find("h1").text.strip()
        fest_data = soup.find("div", class_="MuiGrid-root MuiGrid-item MuiGrid-grid-xs-11 css-twt0ol").text.strip()
        print(fest_data)
    except Exception as ex:
        print(ex)
        print("This was not supposed to happen")
I created a market parser for my own purposes, and overall it works well!
Initially I ran into a write problem that gave a decode error. I changed something and it went away, but now the script does not want to parse the data into JSON; it simply writes 2 characters: {}
Here is main.py:
import json
import requests
from bs4 import BeautifulSoup

def get_first_news():
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36"
    }
    url = "https://funpay.ru/lots/700/"
    r = requests.get(url=url, headers=headers)
    soup = BeautifulSoup(r.text, "lxml")
    articles_cards = soup.find_all("a", class_="tc-desc-text")

    news_dict = {}
    for article in articles_cards:
        article_title = article.find("div", class_="tc-desc-text").text.strip()
        article_desc = article.find("div", class_="tc-price").text.strip()
        article_url = f'https://funpay.ru/lots/700/{article.get("href")}'
        article_id = article_url.split("=")[-1]
        # print(f"{article_title} | {article_url} | {article_date_timestamp}")
        news_dict[article_id] = {
            "article_title": article_title,
            "article_url": article_url,
            "article_desc": article_desc
        }

    with open("news_dict.json", "w") as file:
        json.dump(news_dict, file, indent=4, ensure_ascii=False)

def main():
    get_first_news()

if __name__ == '__main__':
    main()
Here is test.py
# url = "https://www.securitylab.ru/news/520908.php"
#
# article_id = url.split("/")[-1]
# article_id = article_id[:-4]
# print(article_id)
import json
with open("news_dict.json") as file:
news_dict = json.load(file)
search_id = "520908123"
if search_id in news_dict:
print("Новость уже есть в словаре, пропускаем итерацию")
else:
print("Свежая новость, добавляем в словарь")
Here is news_dict.json:
{}
In articles_cards = soup.find_all("a", class_="tc-desc-text"), replace "a" with "div".
Here's what it should look like:
articles_cards = soup.find_all("div", class_="tc-desc-text")
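One caveat with that change: a <div> has no href, so article.get("href") will return None afterwards. Assuming each tc-desc-text div sits inside the listing's <a> element (an assumption about funpay's markup), a sketch of the adjusted parsing could pull the link from the parent anchor instead:

from bs4 import BeautifulSoup

def build_news_dict(html):
    # Sketch: build the same news_dict entries from a funpay lots page,
    # iterating over the description divs instead of anchors.
    soup = BeautifulSoup(html, "lxml")
    news_dict = {}
    for article in soup.find_all("div", class_="tc-desc-text"):
        article_title = article.text.strip()
        # Assumption: the enclosing <a> carries the link to the lot.
        parent_link = article.find_parent("a")
        if parent_link is None or not parent_link.get("href"):
            continue
        article_url = f'https://funpay.ru/lots/700/{parent_link.get("href")}'
        price_div = parent_link.find("div", class_="tc-price")
        article_desc = price_div.text.strip() if price_div else ""
        news_dict[article_url.split("=")[-1]] = {
            "article_title": article_title,
            "article_url": article_url,
            "article_desc": article_desc,
        }
    return news_dict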
Can someone help me modify this script so that it also scrapes the URL associated with each job? The purpose is that, when browsing the .csv file in a spreadsheet, I can click on the link if I would like to know more about the job. Thank you in advance.
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract(page):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'}
    url = f'https://www.indeed.com/jobs?q=Dispensary&l=Denver%2C+CO&radius={page}'
    r = requests.get(url, headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup

def transform(soup):
    divs = soup.find_all('div', class_='jobsearch-SerpJobCard')
    for item in divs:
        title = item.find('a').text.strip()
        company = item.find('span', class_='company').text.strip()
        try:
            salary = item.find('span', class_='salaryText').text.strip()
        except:
            salary = ''
        summary = item.find('div', class_='summary').text.strip().replace('\n', '')
        job = {
            'title': title,
            'company': company,
            'salary': salary,
            'summary': summary
        }
        joblist.append(job)
    return

joblist = []
for i in range(0, 90, 10):
    print(f'Getting page, {i}')
    c = extract(0)
    transform(c)

df = pd.DataFrame(joblist)
print(df.head())
df.to_csv('jobs.csv')
You can use one of these:
url = 'https://www.indeed.com' + item.find('a')['href']
url = 'https://www.indeed.com' + item.find('a').get('href')
url = 'https://www.indeed.com' + item.find('a').attrs['href']
url = 'https://www.indeed.com' + item.find('a').attrs.get('href')
BTW: you always load the same page. To get the next page you have to pass start=... in the URL.
You can also make this more readable with a dictionary and params= in requests:
payload = {
    'q': 'Dispensary',
    'l': 'Denver,+CO',
    'radius': 0,
    'start': page,
}

url = 'https://www.indeed.com/jobs'

r = requests.get(url, params=payload, headers=headers)
Working code:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract(start):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
    }
    payload = {
        'q': 'Dispensary',
        'l': 'Denver,+CO',
        'radius': 0,
        'start': start,
    }
    url = 'https://www.indeed.com/jobs'
    r = requests.get(url, params=payload, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup

def transform(soup, joblist):
    divs = soup.find_all('div', class_='jobsearch-SerpJobCard')
    for item in divs:
        title = item.find('a').text.strip()
        url = 'https://www.indeed.com' + item.find('a')['href']
        #url = 'https://www.indeed.com' + item.find('a').get('href')
        #url = 'https://www.indeed.com' + item.find('a').attrs['href']
        #url = 'https://www.indeed.com' + item.find('a').attrs.get('href')
        company = item.find('span', class_='company').text.strip()
        try:
            salary = item.find('span', class_='salaryText').text.strip()
        except:
            salary = ''
        summary = item.find('div', class_='summary').text.strip().replace('\n', '')
        joblist.append({
            'title': title,
            'url': url,
            'company': company,
            'salary': salary,
            'summary': summary
        })

# --- main ---

joblist = []

for start in range(0, 90, 10):
    print('Getting page', start)
    c = extract(start)
    transform(c, joblist)

df = pd.DataFrame(joblist)
df.to_csv('jobs.csv')
print(df.head())
I've created a script in Python that makes use of POST HTTP requests to get the search results from a webpage. To populate the results, it is necessary to click on the fields sequentially (shown here); a new page then appears with the populated results.
There are ten results on the first page, and the following script can parse them flawlessly.
What I wish to do now is use those results to reach each one's inner page in order to parse the Sole Proprietorship Name (English) from there.
I've tried so far with:
import re
import requests
from bs4 import BeautifulSoup

url = "https://www.businessregistration.moc.gov.kh/cambodia-master/service/create.html?targetAppCode=cambodia-master&targetRegisterAppCode=cambodia-br-soleproprietorships&service=registerItemSearch"

payload = {
    'QueryString': '0',
    'SourceAppCode': 'cambodia-br-soleproprietorships',
    'OriginalVersionIdentifier': '',
    '_CBASYNCUPDATE_': 'true',
    '_CBHTMLFRAG_': 'true',
    '_CBNAME_': 'buttonPush'
}

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
    res = s.get(url)
    target_url = res.url.split("&")[0].replace("view.", "update.")
    node = re.findall(r"nodeW\d.+?-Advanced", res.text)[0].strip()
    payload['_VIKEY_'] = re.findall(r"viewInstanceKey:'(.*?)',", res.text)[0].strip()
    payload['_CBHTMLFRAGID_'] = re.findall(r"guid:(.*?),", res.text)[0].strip()
    payload[node] = 'N'
    payload['_CBNODE_'] = re.findall(r"Callback\('(.*?)','buttonPush", res.text)[2]
    payload['_CBHTMLFRAGNODEID_'] = re.findall(r"AsyncWrapper(W\d.+?)'", res.text)[0].strip()

    res = s.post(target_url, data=payload)
    soup = BeautifulSoup(res.content, 'html.parser')
    for item in soup.find_all("span", class_="appReceiveFocus")[3:]:
        print(item.text)
How can I parse the Name (English) from each result's inner page using requests?
This is one of the ways you can parse the name from the site's inner pages and then the email address from the address tab. I added the get_email() function only because I wanted to show you how you can parse content from different tabs.
import re
import requests
from bs4 import BeautifulSoup

url = "https://www.businessregistration.moc.gov.kh/cambodia-master/service/create.html?targetAppCode=cambodia-master&targetRegisterAppCode=cambodia-br-soleproprietorships&service=registerItemSearch"
result_url = "https://www.businessregistration.moc.gov.kh/cambodia-master/viewInstance/update.html?id={}"
base_url = "https://www.businessregistration.moc.gov.kh/cambodia-br-soleproprietorships/viewInstance/update.html?id={}"

def get_names(s):
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
    res = s.get(url)
    target_url = result_url.format(res.url.split("id=")[1])
    soup = BeautifulSoup(res.text, "lxml")
    payload = {i['name']: i.get('value', '') for i in soup.select('input[name]')}
    payload['QueryString'] = 'a'
    payload['SourceAppCode'] = 'cambodia-br-soleproprietorships'
    payload['_CBNAME_'] = 'buttonPush'
    payload['_CBHTMLFRAG_'] = 'true'
    payload['_VIKEY_'] = re.findall(r"viewInstanceKey:'(.*?)',", res.text)[0].strip()
    payload['_CBHTMLFRAGID_'] = re.findall(r"guid:(.*?),", res.text)[0].strip()
    payload['_CBNODE_'] = re.findall(r"Callback\('(.*?)','buttonPush", res.text)[-1]
    payload['_CBHTMLFRAGNODEID_'] = re.findall(r"AsyncWrapper(W\d.+?)'", res.text)[0].strip()

    res = s.post(target_url, data=payload)
    soup = BeautifulSoup(res.text, "lxml")

    payload.pop('_CBHTMLFRAGNODEID_')
    payload.pop('_CBHTMLFRAG_')
    payload.pop('_CBHTMLFRAGID_')

    for item in soup.select("a[class*='ItemBox-resultLeft-viewMenu']"):
        payload['_CBNAME_'] = 'invokeMenuCb'
        payload['_CBVALUE_'] = ''
        payload['_CBNODE_'] = item['id'].replace('node', '')

        res = s.post(target_url, data=payload)
        soup = BeautifulSoup(res.text, 'lxml')
        address_url = base_url.format(res.url.split("id=")[1])
        node_id = re.findall(r"taba(.*)_", soup.select_one("a[aria-label='Addresses']")['id'])[0]

        payload['_CBNODE_'] = node_id
        payload['_CBHTMLFRAGID_'] = re.findall(r"guid:(.*?),", res.text)[0].strip()
        payload['_CBNAME_'] = 'tabSelect'
        payload['_CBVALUE_'] = '1'

        eng_name = soup.select_one(".appCompanyName + .appAttrValue").get_text()
        yield from get_email(s, eng_name, address_url, payload)

def get_email(s, eng_name, url, payload):
    res = s.post(url, data=payload)
    soup = BeautifulSoup(res.text, 'lxml')
    email = soup.select_one(".EntityEmailAddresses:contains('Email') .appAttrValue").get_text()
    yield eng_name, email

if __name__ == '__main__':
    with requests.Session() as s:
        for item in get_names(s):
            print(item)
The output looks like:
('AMY GEMS', 'amy.n.company#gmail.com')
('AHARATHAN LIN LIANJIN FOOD FLAVOR', 'skykoko344#gmail.com')
('AMETHYST DIAMOND KTV', 'twobrotherktv#gmail.com')
To get the Name (English), you can simply replace print(item.text) with print(item.text.split('/')[1].split('(')[0].strip()), which prints AMY GEMS.
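In the question's original script, that replacement sits in the final loop; it keeps the part of each span's text between the slash and the opening parenthesis:

for item in soup.find_all("span", class_="appReceiveFocus")[3:]:
    # Keep only the English name: the text between "/" and "(".
    print(item.text.split('/')[1].split('(')[0].strip())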
Please help me! I wrote a simple parser, but it does not work correctly, and I do not know what is causing it.
import requests
from bs4 import BeautifulSoup

URL = 'https://stopgame.ru//topgames'
HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0', 'accept': '*/*'}
HOST = 'https://stopgame.ru'

def get_html(url, params=None):
    r = requests.get(url, headers=HEADERS, params=params)
    return r

def get_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('a', class_="lent-block game-block")
    print(items)

def parse():
    html = get_html(URL)
    if html.status_code == 200:
        items = get_content(html.text)
    else:
        print('Error')

parse()
I've got this output:
[]
Process finished with exit code 0
items = soup.find_all('a', class_="lent-block game-block")
You are looking for the "lent-block game-block" class on anchor tags, which is not actually present in the HTML, so you get an empty list.
Try it with this div instead and you will get the list of matching items:
items = soup.find_all('div', class_="lent-block lent-main")
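For completeness, here is a sketch of the two functions from the question with that selector applied; get_content() now returns the matched nodes instead of only printing them, so parse() gets a real list rather than None. What you extract from each block still depends on stopgame's markup.

def get_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    # Class taken from the answer above; the element is a <div>, not an <a>.
    items = soup.find_all('div', class_="lent-block lent-main")
    return items

def parse():
    html = get_html(URL)
    if html.status_code == 200:
        items = get_content(html.text)
        print(f'Found {len(items)} blocks')
    else:
        print('Error')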