Can't fetch all the titles from a webpage - python

I'm trying to parse all the categories and their nested categories recursively from this webpage, which ultimately leads to pages like this one and finally to an innermost page from which I would like to fetch all the product titles.
The script can follow the above steps. However, when it comes to fetching all the titles from the result pages while traversing the next pages, the script collects less content than is actually there.
This is what I've written:
import scrapy
from urllib.parse import urljoin

class mySpider(scrapy.Spider):
    name = "myspider"
    start_urls = ['https://www.phoenixcontact.com/online/portal/gb?1dmy&urile=wcm%3apath%3a/gben/web/main/products/subcategory_pages/Cables_P-10/e3a9792d-bafa-4e89-8e3f-8b1a45bd2682']
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"}

    def parse(self,response):
        cookie = response.headers.getlist('Set-Cookie')[1].decode().split(";")[0]
        for item in response.xpath("//div[./h3[contains(.,'Category')]]/ul/li/a/@href").getall():
            item_link = response.urljoin(item.strip())
            if "/products/list_pages/" in item_link:
                yield scrapy.Request(item_link,headers=self.headers,meta={'cookiejar': cookie},callback=self.parse_all_links)
            else:
                yield scrapy.Request(item_link,headers=self.headers,meta={'cookiejar': cookie},callback=self.parse)

    def parse_all_links(self,response):
        for item in response.css("[class='pxc-sales-data-wrp'][data-product-key] h3 > a[href][onclick]::attr(href)").getall():
            target_link = response.urljoin(item.strip())
            yield scrapy.Request(target_link,headers=self.headers,meta={'cookiejar': response.meta['cookiejar']},callback=self.parse_main_content)

        next_page = response.css("a.pxc-pager-next::attr(href)").get()
        if next_page:
            base_url = response.css("base::attr(href)").get()
            next_page_link = urljoin(base_url,next_page)
            yield scrapy.Request(next_page_link,headers=self.headers,meta={'cookiejar': response.meta['cookiejar']},callback=self.parse_all_links)

    def parse_main_content(self,response):
        item = response.css("h1::text").get()
        print(item)
How can I get all the titles available in that category?
The script gets a different number of results every time I run it.

Your main issue is that you need to use a separate cookiejar for each "/products/list_pages/" request in order to get the next pages correctly. I used a class variable cookie for this (see my code) and got the same result (4293 items) across several runs.
Here is my code (I don't download the product pages; I just read the product titles from the list of products):
class mySpider(scrapy.Spider):
    name = "phoenixcontact"
    start_urls = ['https://www.phoenixcontact.com/online/portal/gb?1dmy&urile=wcm%3apath%3a/gben/web/main/products/subcategory_pages/Cables_P-10/e3a9792d-bafa-4e89-8e3f-8b1a45bd2682']
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"}
    cookie = 1

    def parse(self,response):
        # cookie = response.headers.getlist('Set-Cookie')[1].decode().split(";")[0]
        for item in response.xpath("//div[./h3[contains(.,'Category')]]/ul/li/a/@href").getall():
            item_link = response.urljoin(item.strip())
            if "/products/list_pages/" in item_link:
                cookie = self.cookie
                self.cookie += 1
                yield scrapy.Request(item_link,headers=self.headers,meta={'cookiejar': cookie},callback=self.parse_all_links, cb_kwargs={'page_number': 1})
            else:
                yield scrapy.Request(item_link,headers=self.headers,callback=self.parse)

    def parse_all_links(self,response, page_number):
        # if page_number > 1:
        #     with open("Samples/Page.htm", "wb") as f:
        #         f.write(response.body)
        # for item in response.css("[class='pxc-sales-data-wrp'][data-product-key] h3 > a[href][onclick]::attr(href)").getall():
        for item in response.xpath('//div[@data-product-key]//h3//a'):
            target_link = response.urljoin(item.xpath('./@href').get())
            item_title = item.xpath('./text()').get()
            yield {'title': item_title}
            # yield scrapy.Request(target_link,headers=self.headers,meta={'cookiejar': response.meta['cookiejar']},callback=self.parse_main_content)

        next_page = response.css("a.pxc-pager-next::attr(href)").get()
        if next_page:
            base_url = response.css("base::attr(href)").get()
            next_page_link = response.urljoin(next_page)
            yield scrapy.Request(next_page_link,headers=self.headers,meta={'cookiejar': response.meta['cookiejar']},callback=self.parse_all_links, cb_kwargs={'page_number': page_number + 1})
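For what it's worth, a quick way to run this spider and dump the collected titles to a file is via CrawlerProcess (a minimal sketch, assuming Scrapy 2.1+ for the FEEDS setting; the output filename is arbitrary):

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    "FEEDS": {"titles.csv": {"format": "csv"}},  # hypothetical output file
})
process.crawl(mySpider)
process.start()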

Related

How do I get this code to loop for other stocks? For example, I want it to repeat and show stocks like Tesla, Amazon, Apple all in one execution?

How do I get this code to loop for other stocks? For example, I want it to repeat and show stocks like Tesla, Amazon, and Apple all in one execution. In my code, it only shows one stock, and I want it to display multiple stocks.
Code:
import requests
from bs4 import BeautifulSoup

def create_url():
    url = f'https://finance.yahoo.com/quote/TSLA'
    return url

def get_html(url):
    header = {"User Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'}
    response = requests.get(url, headers = header)
    if response.status_code == 200:
        return response.text
    else:
        return None

def parse_data(html):
    soup = BeautifulSoup(html,'html.parser')
    name = soup.find('h1', {'class': 'D(ib) Fz(18px)'}).text
    price = soup.select_one('#quote-header-info > div.My(6px).Pos(r).smartphone_Mt(6px).W(100%) > div.D(ib).Va(m).Maw(65%).Ov(h) > div.D(ib).Mend(20px) > fin-streamer.Fw(b).Fz(36px).Mb(-4px).D(ib)').text
    stock_data = {
        'name':name,
        'price':price,
    }
    return stock_data

def main():
    url = create_url()
    # get html
    html = get_html(url)
    data = parse_data(html)
    #return data
    print(data)

if __name__ == '__main__':
    main()
Try changing your create_url to take one parameter, which will be the stock you want to query, like so:
def create_url(ticker):
    url = 'https://finance.yahoo.com/quote/' + ticker
    return url
Then, you can create a list of tickers in your main function and call the function for each ticker.
def main():
    tickers = ["AAPL", "TSLA"]
    for ticker in tickers:
        url = create_url(ticker)
        # get html
        html = get_html(url)
        data = parse_data(html)
        print(data)
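If you'd rather collect everything instead of printing stock by stock, a small variation could gather the dicts into one list (just a sketch reusing the helpers above; the ticker list is arbitrary):

def main():
    tickers = ["AAPL", "TSLA", "AMZN"]  # any tickers you like
    all_data = []
    for ticker in tickers:
        html = get_html(create_url(ticker))
        if html:  # get_html returns None on a non-200 response
            all_data.append(parse_data(html))
    print(all_data)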

How to change the code to asynchronously iterate links and IDs for scraping a web page?

I have a list of links, and each link has an id that is in the Id list.
How do I change the code so that, when iterating over the links, the corresponding id is substituted into the string?
All code is below:
import pandas as pd
from bs4 import BeautifulSoup
import requests

HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/81.0.4044.138 Safari/537.36 OPR/68.0.3618.125', 'accept': '*/*'}

links = ['https://www..ie', 'https://www..ch', 'https://www..com']
Id = ['164240372761e5178f0488d', '164240372661e5178e1b377', '164240365661e517481a1e6']

def get_html(url, params=None):
    r = requests.get(url, headers=HEADERS, params=params)

def get_data_no_products(html):
    data = []
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', id= '')  # How do I substitute the corresponding id here?
    for item in items:
        data.append({'pn': item.find('a').get('href')})
    return print(data)

def parse():
    for i in links:
        html = get_html(i)
        get_data_no_products(html.text)

parse()
Parametrise your code:
def get_data_no_products(html, id_):
    data = []
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', id=id_)
And then use zip():
for link, id_ in zip(links, Id):
    html = get_html(link)
    get_data_no_products(html.text, id_)
Note that there's a likely bug in your code: you return print(data), which will always be None. You probably just want to return data.
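For completeness, here's a minimal corrected sketch of the two helpers with those fixes applied (it also makes get_html actually return the response, which the original is missing; it reuses the HEADERS and imports from the question):

def get_html(url, params=None):
    # return the response so callers can read .text
    return requests.get(url, headers=HEADERS, params=params)

def get_data_no_products(html, id_):
    data = []
    soup = BeautifulSoup(html, 'html.parser')
    for item in soup.find_all('div', id=id_):
        data.append({'pn': item.find('a').get('href')})
    return data  # return the data instead of print(data)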
PS
There is another solution to this which you will frequently encounter from people beginning in Python:
for i in range(len(links)):
    link = links[i]
    id_ = ids[i]
    ...
This... works. It might even be easier or more natural if you are coming from, e.g., C. (Then again, I'd likely use pointers...) Style is very much personal, but if you're going to write in a high-level language like Python, you might as well avoid thinking about things like 'the index of the current item' as much as possible. Just my £0.02.
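(And if you genuinely do need the index alongside the item, enumerate is the usual middle ground; a tiny illustrative sketch using the names from the question:)

for i, (link, id_) in enumerate(zip(links, Id)):
    print(i, link, id_)  # index available without range(len(...))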

Handling redirecting <301> from Indeed with Scrapy

I'm building a scraper for Indeed, primarily to practice on. I've set it up so that I extract details for 100 results per page. Using the search query, I loop a seed list of cities and job types inside an f-string of the Indeed URL. I store the results as a dictionary so that the degree types become a column when the results are read into pandas.
My issue is that I keep getting Redirecting (301); I suppose that's because not all the links fulfil the salary requirement. Alternatively, I have included meta={'handle_httpstatus_list': [301]}, but then I get no results at all.
Here's my scraper:
class IndeedItem(scrapy.Item):
    job_title = Field(output_processor = TakeFirst())
    salary = Field(output_processor = TakeFirst())
    category = Field(output_processor = TakeFirst())
    company = Field(output_processor = TakeFirst())

class IndeedSpider(scrapy.Spider):
    name = 'indeed'
    max_results_per_city = 1000
    #names = pd.read_csv("indeed_names.csv")
    #degree = pd.read_csv("degree_names2.csv",encoding='unicode_escape')
    names = pd.DataFrame({'names':['London', 'Manchester']})
    degree = pd.DataFrame({'degrees':['degree+Finance+£25','degree+Engineering+£25'], 'degree_type':['Finance', 'Engineering']})

    start_urls = defaultdict(list)
    for city in names.names:
        for qualification,name in zip(degree.degrees, degree.degree_type):
            start_urls[name].append(f'https://uk.indeed.com/jobs?q={qualification}%2C000&l={city}&fromage=7&filter=0&limit=100')

    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        'DOWNLOAD_DELAY':2
    }

    def start_requests(self):
        for category, url in self.start_urls.items():
            for link in url:
                yield scrapy.Request(
                    link,
                    callback = self.parse,
                    #meta={'handle_httpstatus_list': [301]},
                    cb_kwargs = {
                        'page_count':0,
                        'category':category
                    }
                )

    def parse(self, response, page_count, category):
        if page_count > 30:
            return
        indeed = response.xpath('//div[@id="mosaic-zone-jobcards"]//div')
        for jobs in indeed:
            loader = ItemLoader(IndeedItem(), selector = jobs)
            loader.add_value('category', category)
            loader.add_xpath('job_title', './/h2[@class="jobTitle jobTitle-color-purple jobTitle-newJob"]/span//text()')
            loader.add_xpath('salary', './/div[@class="salary-snippet"]/span//text()')
            loader.add_xpath('company', './/a/div[@class="slider_container"]/div[@class="slider_list"]/div[@class="slider_item"]/div[@class="job_seen_beacon"]/table[@class="jobCard_mainContent"]/tbody/tr/td[@class="resultContent"]/div[@class="heading6 company_location tapItem-gutter"]/pre/span[@class="companyName"]//text()')
            yield loader.load_item

        next_page = response.xpath('//ul[@class="pagination-list"]/li[5]/a//@href').get()
        page_count += 1
        if next_page is not None:
            yield response.follow(
                next_page,
                callback = self.parse,
                cb_kwargs = {
                    'page_count': page_count,
                    'category': category
                }
            )
I didn't get any 301 status, but the start_urls gave me problems and your XPath was off.
This fixes the XPath:
import scrapy
from collections import defaultdict
from scrapy import Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst
import pandas as pd

class IndeedItem(scrapy.Item):
    job_title = Field(output_processor=TakeFirst())
    salary = Field(output_processor=TakeFirst())
    category = Field(output_processor=TakeFirst())
    company = Field(output_processor=TakeFirst())

class IndeedSpider(scrapy.Spider):
    name = 'indeed'
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        'DOWNLOAD_DELAY': 2
    }
    max_results_per_city = 1000
    # names = pd.read_csv("indeed_names.csv")
    # degree = pd.read_csv("degree_names2.csv",encoding='unicode_escape')
    names = pd.DataFrame({'names': ['London', 'Manchester']})
    degree = pd.DataFrame({'degrees': ['degree+Finance+£25,000', 'degree+Engineering+£25,000'], 'degree_type': ['Finance', 'Engineering']})
    start_urls = defaultdict(list)

    def start_requests(self):
        for city in self.names.names:
            for qualification, name in zip(self.degree.degrees, self.degree.degree_type):
                self.start_urls[name].append(f'https://uk.indeed.com/jobs?q={qualification}&l={city}&fromage=7&filter=0&limit=100')

        for category, url in self.start_urls.items():
            for link in url:
                yield scrapy.Request(
                    link,
                    callback=self.parse,
                    #meta={'handle_httpstatus_list': [301]},
                    cb_kwargs={
                        'page_count': 0,
                        'category': category
                    }
                )

    def parse(self, response, page_count, category):
        if page_count > 30:
            return
        indeed = response.xpath('//div[@class="slider_container"]')
        for jobs in indeed:
            loader = ItemLoader(IndeedItem(), selector=jobs)
            loader.add_value('category', category)
            loader.add_xpath('job_title', './/span[@title]//text()')
            loader.add_xpath('salary', './/div[@class="salary-snippet"]/span//text()')
            loader.add_xpath('company', './/span[@class="companyName"]//text()')
            yield loader.load_item()

        next_page = response.xpath('//ul[@class="pagination-list"]//li[last()]/a/@href').get()
        page_count += 1
        if next_page:
            yield response.follow(
                next_page,
                callback=self.parse,
                cb_kwargs={
                    'page_count': page_count,
                    'category': category
                }
            )
If you can give an example of a URL that redirects, I can try to help you.
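As a side note on the 301s: if you do want your callback to see the redirect responses instead of Scrapy following them, the usual approach (a sketch, not specific to Indeed) is to disable the redirect middleware per request and whitelist the status code via meta:

yield scrapy.Request(
    link,
    callback=self.parse,
    meta={
        'dont_redirect': True,            # keep RedirectMiddleware from following the 301
        'handle_httpstatus_list': [301],  # pass the 301 response through to the callback
    },
    cb_kwargs={'page_count': 0, 'category': category},
)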

Remove the "uah"

I wrote a parser that should parse exchange rates, but there's one final touch missing.
Code:
import requests
from bs4 import BeautifulSoup

URL = 'https://www.google.com/search?sxsrf=ALeKk02hYi-HCGXbHdPuek-VJRu_8qsUVg%3A1587054998453&ei=lomYXvaSG7zAmwWP_LHQBA&q=%D0%B4%D0%BE%D0%BB%D0%BB%D0%B0%D1%80+%D0%B3%D1%80%D0%B8%D0%B2%D0%BD%D0%B0&oq=&gs_lcp=CgZwc3ktYWIQARgBMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnOgQIABBHSgkIFxIFMTAtMjRKCAgYEgQxMC0yUPFtWPFtYKt8aAFwAngAgAEAiAEAkgEAmAEAoAEBqgEHZ3dzLXdperABCg&sclient=psy-ab'
HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/80.0.3987.163 Safari/537.36', 'accept': '*/*'}

def get_html(url, params=None):
    r = requests.get(url, headers=HEADERS, params=params)
    return r

def get_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', class_="VgAgW")
    currency = []
    for item in items:
        currency.append({
            'uah': item.find('span', class_='SwHCTb').get_text(strip=True),
        })
    print(f"'Now the course:' + {currency}")
    return currency

def parse():
    html = get_html(URL)
    if html.status_code == 200:
        get_content(html.text)
    else:
        print('Error')

parse()
I don’t know how to remove this: [{'uah':}]
Here is what comes out:
'Now the course:' + [{'uah': '27,22'}]
Process finished with exit code 0
currency is a list (currency = []), so when you print a list it always prints like this: [...].
currency is a list of dicts ({'uah': ...}), so when you print a dict it always prints like this: {key: value}.
It looks like you need print(f"Now the course: {currency[0]['uah']}"), where [0] takes the first element of the list (which is a dict) and ['uah'] then gets that dict's value by its key.
You can add an additional variable course to make it easier to access the value:
course = item.find('span', class_='SwHCTb').get_text(strip=True)
currency.append({'uah': course})
print(f"Now the course: {course}")

Recursively parse all category links and get all products

I've been playing around with web scraping (for this practice exercise, using Python 3.6.2) and I feel like I'm losing it a bit. Given this example link, here's what I want to do:
First, as you can see, there are multiple categories on the page. Clicking each of the categories above will give me other categories, then other ones, and so on, until I reach the products page. So I have to go x levels deep. I thought recursion would help me achieve this, but somewhere I did something wrong.
Code:
Here, I'll explain the way I approached the problem. First, I created a session and a simple generic function which returns an lxml.html.HtmlElement object:
from lxml import html
from requests import Session

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/62.0.3202.94 Safari/537.36"
}
TEST_LINK = 'https://www.richelieu.com/us/en/category/custom-made-cabinet-doors-and-drawers/1000128'

session_ = Session()

def get_page(url):
    page = session_.get(url, headers=HEADERS).text
    return html.fromstring(page)
Then, I thought I'd need two other functions:
one to get the category links
and another one to get the product links
To distinguish between the two, I figured out that only on category pages is there a title which always contains CATEGORIES, so I used that:
def read_categories(page):
    categs = []
    try:
        if 'CATEGORIES' in page.xpath('//div[@class="boxData"][2]/h2')[0].text.strip():
            for a in page.xpath('//*[@id="carouselSegment2b"]//li//a'):
                categs.append(a.attrib["href"])
            return categs
        else:
            return None
    except Exception:
        return None

def read_products(page):
    return [
        a_tag.attrib["href"]
        for a_tag in page.xpath("//ul[@id='prodResult']/li//div[@class='imgWrapper']/a")
    ]
Now, the only thing left is the recursion part, where I'm sure I did something wrong:
def read_all_categories(page):
    cat = read_categories(page)
    if not cat:
        yield read_products(page)
    else:
        yield from read_all_categories(page)

def main():
    main_page = get_page(TEST_LINK)
    for links in read_all_categories(main_page):
        print(links)
Here's all the code put together:
from lxml import html
from requests import Session

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/62.0.3202.94 Safari/537.36"
}
TEST_LINK = 'https://www.richelieu.com/us/en/category/custom-made-cabinet-doors-and-drawers/1000128'

session_ = Session()

def get_page(url):
    page = session_.get(url, headers=HEADERS).text
    return html.fromstring(page)

def read_categories(page):
    categs = []
    try:
        if 'CATEGORIES' in page.xpath('//div[@class="boxData"][2]/h2')[0].text.strip():
            for a in page.xpath('//*[@id="carouselSegment2b"]//li//a'):
                categs.append(a.attrib["href"])
            return categs
        else:
            return None
    except Exception:
        return None

def read_products(page):
    return [
        a_tag.attrib["href"]
        for a_tag in page.xpath("//ul[@id='prodResult']/li//div[@class='imgWrapper']/a")
    ]

def read_all_categories(page):
    cat = read_categories(page)
    if not cat:
        yield read_products(page)
    else:
        yield from read_all_categories(page)

def main():
    main_page = get_page(TEST_LINK)
    for links in read_all_categories(main_page):
        print(links)

if __name__ == '__main__':
    main()
Could someone please point me in the right direction regarding the recursion function?
Here is how I would solve this:
from lxml import html as html_parser
from requests import Session

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"
}

def dig_up_products(url, session=Session()):
    html = session.get(url, headers=HEADERS).text
    page = html_parser.fromstring(html)

    # if it appears to be a categories page, recurse
    for link in page.xpath('//h2[contains(., "CATEGORIES")]/'
                           'following-sibling::div[@id="carouselSegment1b"]//li//a'):
        yield from dig_up_products(link.attrib["href"], session)

    # if it appears to be a products page, return the links
    for link in page.xpath('//ul[@id="prodResult"]/li//div[@class="imgWrapper"]/a'):
        yield link.attrib["href"]

def main():
    start = 'https://www.richelieu.com/us/en/category/custom-made-cabinet-doors-and-drawers/1000128'
    for link in dig_up_products(start):
        print(link)

if __name__ == '__main__':
    main()
There is nothing wrong with iterating over an empty XPath expression result, so you can simply put both cases (categories page/products page) into the same function, as long as the XPath expressions are specific enough to identify each case.
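A quick illustration of that point (a toy snippet, not part of the spider above: on a page with no matching products, the loop body simply never runs):

from lxml import html as html_parser

page = html_parser.fromstring("<html><body><p>a category page, no products</p></body></html>")
# xpath() returns an empty list when nothing matches, so this prints nothing and raises nothing
for link in page.xpath('//ul[@id="prodResult"]/li//div[@class="imgWrapper"]/a'):
    print(link.attrib["href"])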
You can do it like this as well to make your script slightly more concise. I used the lxml library along with CSS selectors to do the job. The script parses all the links under each category and looks for a dead end; when it reaches one, it parses the titles from that page, then does the whole thing over and over again until all the links are exhausted.
from lxml.html import fromstring
import requests

def products_links(link):
    res = requests.get(link, headers={"User-Agent": "Mozilla/5.0"})
    page = fromstring(res.text)
    try:
        for item in page.cssselect(".contentHeading h1"):  # check for the match available in the target page
            print(item.text)
    except:
        pass
    for link in page.cssselect("h2:contains('CATEGORIES')+[id^='carouselSegment'] .touchcarousel-item a"):
        products_links(link.attrib["href"])

if __name__ == '__main__':
    main_page = 'https://www.richelieu.com/us/en/category/custom-made-cabinet-doors-and-drawers/1000128'
    products_links(main_page)
Partial result:
BRILLANTÉ DOORS
BRILLANTÉ DRAWER FRONTS
BRILLANTÉ CUT TO SIZE PANELS
BRILLANTÉ EDGEBANDING
LACQUERED ZENIT DOORS
ZENIT CUT-TO-SIZE PANELS
EDGEBANDING
ZENIT CUT-TO-SIZE PANELS
