why am I still getting results as html using beautifulsoup? - python

I am writing a simple scraper for job postings. My function extract_fulltext is supposed to return the full job description, but I still get HTML tags in the response. In this case it also fails with raise InvalidSchema("No connection adapters were found for '%s'" % url); the full traceback is at https://gist.github.com/SkyBulk/c6df488ef53ae6bc62c86670cfbd09ec
def extract_fulltext(url):
    """Fetch the job descriptions for every job key found on the page at `url`.

    Downloads `url`, collects the ids matched by the ``jobKeysWithInfo['...']``
    pattern, requests them from the ``rpc/jobdescs`` endpoint and parses the
    response with BeautifulSoup.

    Returns the BeautifulSoup object itself rather than the extracted text,
    so the caller still receives markup.
    """
    html = requests.get(url)
    job_ids = ','.join(re.findall(r"jobKeysWithInfo\['(.+?)'\]", html.text))
    ajax_url = 'https://www.indeed.com/rpc/jobdescs?jks=' + urllib.parse.quote(job_ids)
    ajax_content = requests.get(ajax_url)
    soup = BeautifulSoup(ajax_content.text, "lxml")
    text = soup.getText()  # computed here but never returned
    return soup
response = requests.get(url, headers=self.headers)
data = response.text
soup = get_soup(data)
html = soup.find_all(
name="div", attrs={"class": "row"})
for page in html:
print(page)
prefix = ['30', 'monaten', 'meses', 'luni', 'mois', 'month', 'months', 'maanden',
'mesi', 'mies.', 'm\u00e5nader', '\u043c\u0435\u0441\u044f\u0446\u0435\u0432']
date_str = extract_date(page)
s_date = date_str.replace('+', '')
match = [prefix_match for prefix_match in prefix if prefix_match in s_date]
if len(match) > 0:
pass
elif "NOT_FOUND" in s_date:
pass
else:
self.data_extracted['jobs'].append({
'job_title': extract_job_title(page),
'company': extract_company(page),
'city': extract_location(page),
'date': extract_date(page),
'cleared': extract_fulltext(page),
'url': [self.urls[country] + extract_link(page)]
})
I expect the output of {"job_id": "description"} , but the actual output is an error

You can build your approach on this solution:
import requests,json
from bs4 import BeautifulSoup
# The response is parsed as JSON; each value is treated as a fragment of
# description HTML (presumably keyed by job id -- verify against the endpoint).
req = requests.get('https://www.indeed.com/rpc/jobdescs?jks=80635306093cf18a,7496998d9ee18bdc')
data = json.loads(req.text)
for description_html in data.values():
    # Name the parser explicitly so the result does not depend on which
    # optional parsers are installed.
    soup = BeautifulSoup(description_html, 'html.parser')
    print(soup.text)
Demo : Here

Simply use .get_text():
def extract_fulltext(url):
    """Return the plain text of the job descriptions linked from the page at `url`.

    Downloads `url`, collects the ids matched by the ``jobKeysWithInfo['...']``
    pattern, requests them from the ``rpc/jobdescs`` endpoint, and returns the
    response with all markup stripped by ``BeautifulSoup.get_text()``.
    """
    html = requests.get(url)
    job_ids = ','.join(re.findall(r"jobKeysWithInfo\['(.+?)'\]", html.text))
    ajax_url = 'https://www.indeed.com/rpc/jobdescs?jks=' + urllib.parse.quote(job_ids)
    ajax_content = requests.get(ajax_url)
    soup = BeautifulSoup(ajax_content.text, "lxml")
    # Return the extracted text, not the soup object, so no tags reach the caller.
    return soup.get_text()

Related

How to get the tokens in data-search-sol-meta

def extract(page):
    """Download search-results page number `page` and return it as a BeautifulSoup tree."""
    url = f'https://www.jobstreet.com.my/en/job-search/administrative-assistant-jobs/{page}/'
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup
def transform(soup):
jobs = soup.find_all('div', class_='sx2jih0 zcydq876 zcydq866 zcydq896 zcydq886 zcydq8n zcydq856 zcydq8f6 zcydq8eu')
for job in jobs[:29]:
for token in job.find_all('div', attrs={'data-search-sol-meta': True}):
more_details = token.text.strip()
job_detail = {
'more details': more_details
}
joblist.append(job_detail)
# Rows collected by transform(); turned into a DataFrame once all pages are done.
joblist = []
dummy = 2  # number of result pages to fetch
for i in range(0, dummy, 1):
    c = extract(i + 1)  # result pages are numbered from 1
    transform(c)
    print(f'Progress Page: [{int(i) + 1}/{dummy}]')
    time.sleep(4)  # pause between requests
df = pd.DataFrame(joblist)
I want to scrape the tokens in those data-search-sol-meta attributes. How do I get them?
<div data-search-sol-meta="{"searchRequestToken":"62781aeb-4a14-43c9-b985-8be617cc1107","token":"0~62781aeb-4a14-43c9-b985-8be617cc1107","jobId":"jobstreet-my-job-5011156","section":"MAIN","sectionRank":1,"jobAdType":"ORGANIC","tags":{"mordor__flights":"mordor_80","jobstreet:userGroup":"BB","jobstreet:s_vi":"[CS]v1|314CC40D0D655F39-400007A66AC825EB[CE]"}}">
the results in the pd (more_details column) that I've got is just "None"
I would use a more robust css selector list i.e. not the dynamic classes. Be high enough in the DOM to be able to select both the attributes you want and then the job info. You can extract the attribute with the tokens and use json library to list separately.
import requests, json
from bs4 import BeautifulSoup
def extract(page):
    """Download search-results page number `page` and return it as a BeautifulSoup tree."""
    url = f"https://www.jobstreet.com.my/en/job-search/administrative-assistant-jobs/{page}/"
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html.parser")
    return soup
def transform(soup):
    """Print the title, the raw metadata and the two tokens of every job listing in `soup`.

    Listings are selected structurally (direct ``div`` children of the
    ``data-automation=jobListing`` container that hold an ``article``) instead
    of by generated class names. The ``data-search-sol-meta`` attribute holds
    a JSON document, which is decoded to read the individual tokens.
    """
    jobs = soup.select("[data-automation=jobListing] > div:has(article)")
    for job in jobs:
        raw_meta = job["data-search-sol-meta"]
        print(job.select_one("h1 span").text)
        print()
        print(raw_meta)
        print()
        data = json.loads(raw_meta)
        print("searchRequestToken: ", data["searchRequestToken"])
        print("token: ", data["token"])
        print()
soup = extract(1)
transform(soup)

Save dictionary in Excel with Python

I need your help to save the data in Excel. I`ve parsed some site and I need to input dictionary in Excel.
from scrapingbee import ScrapingBeeClient
import requests
from bs4 import BeautifulSoup
import pandas as pd
SCRAPINGBEE_API_KEY = "bzzzz"
endpoint = "https://app.scrapingbee.com/api/v1"
pages = [
'https://www.businesslist.com.ng/category/restaurants/1/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/2/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/3/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/4/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/5/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/6/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/7/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/8/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/9/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/10/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/11/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/12/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/13/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/14/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/15/city:lagos'
]
rest = []
#GET_LINKS
# Collect the detail-page link of every restaurant on the selected listing pages.
for url in pages[:1]:
    params = {
        'api_key': SCRAPINGBEE_API_KEY,
        'url': url}
    response = requests.get(endpoint, params=params)
    soup = BeautifulSoup(response.content, 'html.parser')
    body = soup.find_all('h4')
    for items in body:
        # The href found in each <h4> is site-relative, so prefix the host.
        item = items.find('a').get('href')
        item_link = 'https://www.businesslist.com.ng' + item
        rest.append(item_link)
#GET_REST
for url in rest[:2]:
    params = {
        'api_key': SCRAPINGBEE_API_KEY,
        'url': url}
    info = {}
    response = requests.get(endpoint, params=params)
    soup = BeautifulSoup(response.content, 'html.parser')
    restaraunt_name = soup.find('b', {'id':'company_name'}).text
    info.update({'Restaraunt':restaraunt_name})
    location = soup.find('div', {'class':'text location'}).text.split('View Map')[0]
    info.update({'Location':location})
    phone = soup.find('div', {'class':'text phone'}).text[:11]
    info.update({'Phone':phone})
    web = soup.find('div', {'class':'text weblinks'}).text
    info.update({'web':web})
    # Still inside the loop: every iteration rebuilds df from the current
    # restaurant only and writes it to the same file again.
    df = pd.DataFrame(info)
    df.to_excel('./Lagos.xlsx')
I take each link from the list 'rest' and scrape the data from it. I want to collect the items from every link in the dictionary 'info' and then save them all to an Excel file. However, the code saves only one row to the file instead of all of them. What have I missed?
You are creating and saving df inside the loop under the same file name, so each iteration overwrites the previous one and the Excel file ends up with a single entry. Create an empty DataFrame outside the loop, add each row to it, and write the Excel file once after the loop has finished.
Your altered code will be like
# Accumulate one row per restaurant, then write the Excel file once at the end.
all_info = pd.DataFrame()
for url in rest[:2]:
    params = {
        'api_key': SCRAPINGBEE_API_KEY,
        'url': url}
    response = requests.get(endpoint, params=params)
    soup = BeautifulSoup(response.content, 'html.parser')
    info = {
        'Restaraunt': soup.find('b', {'id': 'company_name'}).text,
        'Location': soup.find('div', {'class': 'text location'}).text.split('View Map')[0],
        'Phone': soup.find('div', {'class': 'text phone'}).text[:11],
        'web': soup.find('div', {'class': 'text weblinks'}).text,
    }
    # Wrapping the dict in a list makes pandas build exactly one row from the
    # scalar values. pd.concat replaces DataFrame.append, which was removed in
    # pandas 2.0.
    all_info = pd.concat([all_info, pd.DataFrame([info])], ignore_index=True)
all_info.to_excel('./Lagos.xlsx')
How about creating a list with all the data, then converting that to a dataframe and then outputting that to an Excel file.
from scrapingbee import ScrapingBeeClient
import requests
from bs4 import BeautifulSoup
import pandas as pd
SCRAPINGBEE_API_KEY = "zzzzzzzzz"
endpoint = "https://app.scrapingbee.com/api/v1"
pages = [
'https://www.businesslist.com.ng/category/restaurants/1/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/2/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/3/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/4/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/5/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/6/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/7/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/8/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/9/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/10/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/11/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/12/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/13/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/14/city:lagos',
'https://www.businesslist.com.ng/category/restaurants/15/city:lagos'
]
rest = []
#GET_LINKS
# Collect the detail-page link of every restaurant on the selected listing pages.
for url in pages[:1]:
    params = {
        'api_key': SCRAPINGBEE_API_KEY,
        'url': url}
    response = requests.get(endpoint, params=params)
    soup = BeautifulSoup(response.content, 'html.parser')
    body = soup.find_all('h4')
    for items in body:
        # The href found in each <h4> is site-relative, so prefix the host.
        item = items.find('a').get('href')
        item_link = 'https://www.businesslist.com.ng' + item
        rest.append(item_link)
#GET_REST
# One dict per restaurant; the DataFrame is built from the full list after the
# loop so every restaurant becomes its own row in the Excel file.
data = []
for url in rest[:2]:
    params = {
        'api_key': SCRAPINGBEE_API_KEY,
        'url': url}
    response = requests.get(endpoint, params=params)
    soup = BeautifulSoup(response.content, 'html.parser')
    info = {
        'Restaraunt': soup.find('b', {'id': 'company_name'}).text,
        'Location': soup.find('div', {'class': 'text location'}).text.split('View Map')[0],
        'Phone': soup.find('div', {'class': 'text phone'}).text[:11],
        'web': soup.find('div', {'class': 'text weblinks'}).text,
    }
    data.append(info)
df = pd.DataFrame(data)
df.to_excel('./Lagos.xlsx')

Newbie: Python "AttributeError: 'NoneType' object has no attribute 'text' " when scraping Tripadvisor Reviews

I am trying to scrape some Tripadvisor reviews as a complete newbie to this.
I'm using code from Susanli2016.
It worked (though, removing the attribute "language") for one link but it doesn't work for any more link (for example.)
I'm receiving the error:
> Traceback (most recent call last):
> File "<pyshell#27>", line 4, in <module>
> items = scrape(url)
> File "<pyshell#12>", line 11, in scrape
> items = parse(session, url + '?filterLang=' + lang)
> File "<pyshell#15>", line 12, in parse
> num_reviews = soup.find('span', class_='hotels-hotel-review-community-content-TabBar__tabCount--37DbH').text # get text
> AttributeError: 'NoneType' object has no attribute 'text'
I'm attaching the code here with the changes I made in case someone can help me.
Thank you so much!
Silvia
--
I substituted the original:
num_reviews = soup.find('span', class_='reviews_header_count').text # get text
with
num_reviews = soup.find('span', class_='hotels-hotel-review-community-content-TabBar__tabCount--37DbH').text # get text
With the original code I get the error
ValueError: invalid literal for int() with base 10: '5.695'
(where 5.695 is the number of reviews in the page)
--
Hereby the complete code:
import requests
from bs4 import BeautifulSoup
import csv
import webbrowser
import io
def display(content, filename='output.html'):
    """Write `content` (bytes) to `filename` and open that file in the web browser."""
    with open(filename, 'wb') as f:
        f.write(content)
    # Open the file only after the `with` block has closed it, so the browser
    # sees the complete content.
    webbrowser.open(filename)
def get_soup(session, url, show=False):
    """GET `url` through `session` and return the page parsed with BeautifulSoup.

    When `show` is true the raw response is also written to ``temp.html`` and
    opened via `display`. Returns None (after printing the status code) when
    the response status is not 200.
    """
    r = session.get(url)
    if show:
        display(r.content, 'temp.html')
    if r.status_code != 200:  # not OK
        print('[get_soup] status code:', r.status_code)
        return None
    return BeautifulSoup(r.text, 'html.parser')
def post_soup(session, url, params, show=False):
    """POST `params` to `url` through `session` and return the parsed response.

    When `show` is true the raw response is also written to ``temp.html`` and
    opened via `display`. Returns None (after printing the status code) when
    the response status is not 200.
    """
    r = session.post(url, data=params)
    if show:
        display(r.content, 'temp.html')
    if r.status_code != 200:  # not OK
        print('[post_soup] status code:', r.status_code)
        return None
    return BeautifulSoup(r.text, 'html.parser')
def scrape(url, lang='ALL'):
    """Return the reviews that `parse` collects for `url`, filtered by `lang`.

    The language is passed to the site as the ``filterLang`` query parameter.
    """
    # create session to keep all cookies (etc.) between requests
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0',
    })
    items = parse(session, url + '?filterLang=' + lang)
    return items
def parse(session, url):
    '''Get number of reviews and start getting subpages with reviews'''
    print('[parse] url:', url)
    soup = get_soup(session, url)
    if not soup:
        print('[parse] no soup:', url)
        return
    # find() returns None when no <span> has this class; the .text access
    # then raises AttributeError.
    num_reviews = soup.find('span', class_='hotels-hotel-review-community-content-TabBar__tabCount--37DbH').text # get text
    num_reviews = num_reviews[1:-1]  # drop the first and last character
    num_reviews = num_reviews.replace(',', '')
    num_reviews = int(num_reviews) # convert text into integer
    print('[parse] num_reviews ALL:', num_reviews)
    # Subpages differ only by an "-or<offset>" suffix before ".html".
    url_template = url.replace('.html', '-or{}.html')
    print('[parse] url_template:', url_template)
    items = []
    offset = 0
    while True:
        subpage_url = url_template.format(offset)
        subpage_items = parse_reviews(session, subpage_url)
        if not subpage_items:
            break
        items += subpage_items
        # Fewer than 5 reviews on a subpage is treated as the last page.
        if len(subpage_items) < 5:
            break
        offset += 5
    return items
def get_reviews_ids(soup):
    """Return the ``data-reviewid`` values of the review <div>s in `soup`.

    Only every second matching element is kept (``[::2]``). Returns None when
    no element carries the attribute.
    """
    items = soup.find_all('div', attrs={'data-reviewid': True})
    if not items:
        return None
    reviews_ids = [x.attrs['data-reviewid'] for x in items][::2]
    print('[get_reviews_ids] data-reviewid:', reviews_ids)
    return reviews_ids
def get_more(session, reviews_ids):
    """POST `reviews_ids` to the OverlayWidgetAjax endpoint and return the parsed response.

    Returns whatever `post_soup` returns for the request.
    """
    url = 'https://www.tripadvisor.com/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS_RESP&metaReferer=Hotel_Review'
    payload = {
        'reviews': ','.join(reviews_ids), # ie. "577882734,577547902,577300887",
        #'contextChoice': 'DETAIL_HR', # ???
        'widgetChoice': 'EXPANDED_HOTEL_REVIEW_HSX', # ???
        'haveJses': 'earlyRequireDefine,amdearly,global_error,long_lived_global,apg-Hotel_Review,apg-Hotel_Review-in,bootstrap,desktop-rooms-guests-dust-en_US,responsive-calendar-templates-dust-en_US,taevents',
        'haveCsses': 'apg-Hotel_Review-in',
        'Action': 'install',
    }
    soup = post_soup(session, url, payload)
    return soup
def parse_reviews(session, url):
    '''Get all reviews from one page'''
    print('[parse_reviews] url:', url)
    soup = get_soup(session, url)
    if not soup:
        print('[parse_reviews] no soup:', url)
        return
    # find() returns None when the page has no <h1 id="HEADING">; the .text
    # access then raises AttributeError.
    hotel_name = soup.find('h1', id='HEADING').text
    reviews_ids = get_reviews_ids(soup)
    if not reviews_ids:
        return
    # Replace the page soup with the response fetched for these review ids.
    soup = get_more(session, reviews_ids)
    if not soup:
        print('[parse_reviews] no soup:', url)
        return
    items = []
    for idx, review in enumerate(soup.find_all('div', class_='reviewSelector')):
        badgets = review.find_all('span', class_='badgetext')
        if len(badgets) > 0:
            contributions = badgets[0].text
        else:
            contributions = '0'
        if len(badgets) > 1:
            helpful_vote = badgets[1].text
        else:
            helpful_vote = '0'
        user_loc = review.select_one('div.userLoc strong')
        if user_loc:
            user_loc = user_loc.text
        else:
            user_loc = ''
        # Takes the part after the last '_' of the element's second CSS class.
        bubble_rating = review.select_one('span.ui_bubble_rating')['class']
        bubble_rating = bubble_rating[1].split('_')[-1]
        # Only body and date are stored; the values computed above are unused.
        item = {
            'review_body': review.find('p', class_='partial_entry').text,
            'review_date': review.find('span', class_='ratingDate')['title'], # 'ratingDate' instead of 'relativeDate'
        }
        items.append(item)
        print('\n--- review ---\n')
        for key,val in item.items():
            print(' ', key, ':', val)
    print()
    return items
def write_in_csv(items, filename='results.csv',
                 headers=('hotel name', 'review title', 'review body',
                          'review date', 'contributions', 'helpful vote',
                          'user name', 'user location', 'rating'),
                 mode='w'):
    """Write `items` (a list of dicts) to `filename` as UTF-8 encoded CSV.

    `headers` lists the column names; csv.DictWriter raises ValueError for an
    item key that is not among them. The header row is written only when
    `mode` is 'w', so appending with mode='a' does not repeat it.
    """
    print('--- CSV ---')
    # newline='' leaves line endings to the csv module; without it, platforms
    # that translate newlines produce an empty line after every row.
    with io.open(filename, mode, encoding="utf-8", newline='') as csvfile:
        csv_file = csv.DictWriter(csvfile, headers)
        if mode == 'w':
            csv_file.writeheader()
        csv_file.writerows(items)
DB_COLUMN = 'review_body'
DB_COLUMN1 = 'review_date'
start_urls = [
'https://www.tripadvisor.com/Restaurant_Review-g187823-d2101904-Reviews-Eataly_Genova-Genoa_Italian_Riviera_Liguria.html',
]
headers = [
DB_COLUMN,
DB_COLUMN1,
]
lang = 'it'
for url in start_urls:
    # get all reviews for 'url'; no language is passed, so scrape() uses its default
    items = scrape(url)
    if not items:
        print('No reviews')
    else:
        # write in CSV, named after the URL part that follows 'Reviews-'
        # with the trailing '.html' removed
        filename = url.split('Reviews-')[1][:-5]
        print('filename:', filename)
        write_in_csv(items, filename + '.csv', headers, mode='w')
I realized the problem lies in the source code.
hotel_name = soup.find('h1', id='HEADING').text
found no target id in the source website. I substituted it with:
hotel_name = soup.find('h1', class_='heading').text
I hope it can help others!

How to scrape whole website using beautifulsoup

I am trying to get all the unique urls of the website by calling the all_pages function recursively but this function is not giving all the urls of the website.
All I want to do is get all the unique urls of the website using BeautifulSoup. My code looks like this:
base_url = "http://www.readings.com.pk/"
unique_urls=[]
def all_pages(base_url,unique_urls=[]):
    """Recursively collect links that contain `base_url` into `unique_urls`.

    Every new link found on the page is printed, appended to `unique_urls`
    and then crawled in turn.
    """
    response = requests.get(base_url)
    soup = BeautifulSoup(response.content, "html.parser")
    for link in soup.find_all("a"):
        url = link["href"]
        absolute_url = urljoin(base_url, url)
        if absolute_url not in unique_urls:
            if base_url in absolute_url:
                unique_urls.append(absolute_url)
                print (absolute_url)
                # NOTE(review): book_urls is not defined in this snippet and
                # all_pages accepts only two parameters.
                all_pages(absolute_url,unique_urls,book_urls)
all_pages(base_url,unique_urls)
Use response.text instead of response.content
Also, you need to return at some point. Additionally, instead of making unique_urls a list, make it a set and they will always be unique.
Additionally, your method is recursive and python has a max recursion depth, so maybe you should instead do this:
base_url = "http://www.readings.com.pk/"
def all_pages(base_url):
    """Crawl the site at `base_url` iteratively and return the set of its URLs.

    Starting from `base_url`, each not-yet-visited page is downloaded and the
    targets of its <a href> links are resolved against that page's own URL.
    Only links on the same host as `base_url` are collected and followed, so
    the crawl stays on the site and terminates once every collected URL has
    been visited.
    """
    from urllib.parse import urljoin, urlparse

    site = urlparse(base_url).netloc
    unique_urls = {base_url}
    visited_urls = set()
    while len(unique_urls) > len(visited_urls):
        unvisited_url = (unique_urls - visited_urls).pop()
        visited_urls.add(unvisited_url)
        response = requests.get(unvisited_url)
        soup = BeautifulSoup(response.text, "html.parser")
        # href=True skips anchors without an href attribute, so no exception
        # handling is needed for the attribute lookup.
        for link in soup.find_all("a", href=True):
            # urljoin handles relative, root-relative and absolute hrefs;
            # plain string concatenation only works for one of those forms.
            absolute_url = urljoin(unvisited_url, link["href"])
            if urlparse(absolute_url).netloc == site:
                unique_urls.add(absolute_url)
    return unique_urls
all_pages(base_url)

How to scrape the web table with multiple pages using R or Python

I want to scrape a website to gather data for a data-mining study. The page contains a big table spread over 43 pages, and some of the stocks are hidden in the expandable menu at the far right of each row.
The web page is below.
http://data.10jqka.com.cn/market/longhu/yyb/
import bs4
import requests
url = r"http://data.10jqka.com.cn/market/longhu/yyb/"
response = requests.get(url)
if response.status_code == 200:
    content = response.content
    soup = bs4.BeautifulSoup(content)
    table_results = soup.findAll("table", {"class": "m_table"})
    for item in table_results:
        # [0] takes only the first matching cell of the whole table.
        company_name = item.findAll("td", {"class": "tl"})[0].text.strip()
        detail = item.findAll("td", {"class": "tc"})[0].text.strip()
        c_rise = item.findAll("td", {"class": "c_rise"})[0].text.strip()
        c_fall = item.findAll("td", {"class": "c_fall"})[0].text.strip()
        cur = item.findAll("td", {"class": "cur"})[0].text.strip()
        lhb_stocklist = item.findAll("div", {"class": "lhb_stocklist"})[0].text.strip()
        # print() call instead of the Python 2 print statement
        print(company_name, detail, c_rise, c_fall, lhb_stocklist)
A solution based on requests, BeautifulSoup, and lxml:
import json
import requests
from bs4 import BeautifulSoup
# %d is replaced by the page number.
URL = 'http://data.10jqka.com.cn/interface/market/longhuyyb/stocknum/desc/%d/20'
# config end_page as needed, or parse http://data.10jqka.com.cn/market/longhu/yyb/ to make it auto adapted
end_page = 2
result = []
for page_idx in range(1, end_page + 1):
    print('Extracting page', page_idx)
    raw_response = requests.get(URL % page_idx)
    # The response is JSON; its 'data' field is parsed as HTML below.
    page_content = json.loads(raw_response.text)['data']
    html = BeautifulSoup(page_content, 'lxml')
    for row in html.tbody.find_all('tr'):
        name_cell = row.find(class_='tl')
        stock_links = row.find(class_='lhb_stocklist_box hide').p.find_all('a')
        result.append({
            'company': name_cell.text,
            'detail_link': name_cell.a['href'],
            'buy': float(row.find(class_='c_rise').text),
            'sell': float(row.find(class_='c_fall').text),
            'stock_cnt': int(row.find(class_='cur').text),
            # (name, link) pair for every stock in the hidden list box
            'stocks': [(a.text, a['href']) for a in stock_links],
        })
print('Company number:', len(result))
I put all data into a list of dictionaries, for easy accessing. You can modify the codes to directly write to a CSV or whatever

Categories