I'm trying to first crawl the main page of this website for the links to a table for each year. Then I'd like to scrape each of those pages while keeping a record of the year.
So far I have my spider constructed as:
div = response.xpath('//*[@id="sidebar"]/div[1]/nav/ul/li[5]/div')
hrefs = div.xpath('*//a').extract()
splits = {}
for href in hrefs:
    split = href.split('"')
    link = split[1]
    date = split[2]
    clean_date = "".join(re.findall("[^><a/]", date))
    clean_link = "http://www.ylioppilastutkinto.fi" + str(link)
    splits[clean_date] = clean_link
I would then like to go through each of these links and crawl them, using the following logic:
table = resp.xpath('//*[@id="content"]/table/tbody')
rows = table.xpath('//tr')
data_dict = {
    "Category": [w3lib.html.remove_tags(num.get()) for num in rows[0].xpath('td')[1:]]
}
for row in rows[1:]:
    data = row.xpath('td')
    title = w3lib.html.remove_tags(data[0].get())
    nums = [w3lib.html.remove_tags(num.get()) for num in data[1:]]
    data_dict[title] = nums
My problem is that I couldn't find a way to do this effectively. Calling scrapy.Request on the URL returns a response with just the content <html></html>. If there were a way for the response object to resemble the one given by the fetch command in the Scrapy shell, that would be ideal, since I've based the selection logic on testing with that command.
Edit:
Here's the entire spider so far
The idea is to run the first for loop to get the links and then the second for loop to extract the tables from those links.
import scrapy
import regex as re
from scrapy.http import HtmlResponse
import w3lib.html

class MainSpider(scrapy.Spider):
    name = 'links'
    allowed_domains = ['www.ylioppilastutkinto.fi/ylioppilastutkinto/pisterajat']
    start_urls = ['https://www.ylioppilastutkinto.fi/ylioppilastutkinto/pisterajat/']

    def parse(self, response):
        div = response.xpath('//*[@id="sidebar"]/div[1]/nav/ul/li[5]/div')
        hrefs = div.xpath('*//a').extract()

        splits = {}
        for href in hrefs:
            split = href.split('"')
            link = split[1]
            date = split[2]
            clean_date = "".join(re.findall("[^><a/]", date))
            clean_link = "http://www.ylioppilastutkinto.fi" + str(link)
            splits[clean_date] = clean_link

        for date, url in splits.items():
            resp = HtmlResponse(url)
            table = resp.xpath('//*[@id="content"]/table/tbody')
            rows = table.xpath('//tr')
            data_dict = {"Category": [w3lib.html.remove_tags(num.get()) for num in rows[0].xpath('td')[1:]]}
            for row in rows[1:]:
                data = row.xpath('td')
                title = w3lib.html.remove_tags(data[0].get())
                nums = [w3lib.html.remove_tags(num.get()) for num in data[1:]]
                data_dict[title] = nums

            yield {
                'Date': date,
                'Scores': data_dict}
Initializing an HtmlResponse(url) doesn't accomplish anything, since the class doesn't make the request itself.
To add a request to Scrapy's scheduler, you need to yield one, e.g.: yield scrapy.Request(url, callback=self.parse).
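For instance, a minimal sketch of how the second loop in your parse method could hand each link off to a separate callback (the parse_table name and cb_kwargs payload here are illustrative; the full working example below shows the more idiomatic LinkExtractor version):

    # inside parse(), after building the splits dict:
    for date, url in splits.items():
        # Scrapy downloads the page and calls parse_table with a full response,
        # much like fetch() does in the Scrapy shell
        yield scrapy.Request(url, callback=self.parse_table, cb_kwargs={'date': date})

    # a separate callback that receives the downloaded response:
    def parse_table(self, response, date):
        table = response.xpath('//*[@id="content"]/table/tbody')
        # ... rest of your table-extraction logic ...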
That being said, there are many improvements you can make to your spider.
Use Scrapy's built-in LinkExtractor instead of string splitting
Use CSS selectors instead of the hardcoded XPaths
Use selector.root.text instead of w3lib.remove_tags (to remove the dependency entirely)
Here is a working example:
import scrapy
from scrapy.linkextractors import LinkExtractor

class MainSpider(scrapy.Spider):
    name = 'links'
    allowed_domains = ['www.ylioppilastutkinto.fi']
    start_urls = ['https://www.ylioppilastutkinto.fi/ylioppilastutkinto/pisterajat/']

    def parse(self, response):
        le = LinkExtractor(
            allow_domains=self.allowed_domains,
            restrict_xpaths='//*[@id="sidebar"]/div[1]/nav/ul/li[5]/div',
        )
        for link in le.extract_links(response):
            yield scrapy.Request(
                url=link.url,
                callback=self.parse_table,
                cb_kwargs={'date': link.text},
            )

    def parse_table(self, response, date):
        rows = response.css('#content table tbody tr')
        if not rows:
            print(f'No table found for url: {response.url}')
            return

        category = [char.root.text for char in rows[0].css('td strong')[1:]]
        if not category:
            category = [char.root.text for char in rows[0].css('td')[1:]]

        for row in rows[1:]:
            cols = row.css('td')
            title = cols[0].root.text
            nums = [col.root.text for col in cols[1:]]
            yield {
                'Date': date,
                'Category': category,
                title: nums
            }
Note that your category parsing doesn't appear to work. I'm not exactly sure what you are trying to extract, so I'll leave that one for you.
Related
I am trying to scrape product URLs from the starting page and after that scrape each product's details. For some reason, a smaller version of the code, which basically tells Scrapy to only scrape the first "layer", returns correct data without duplicates, but when I run the bigger version, it returns duplicates for the 1st layer while correctly scraping the 2nd layer of the web page.
Here is the code for the simplified program:
import scrapy
from scrapy.linkextractors import LinkExtractor

class EglupiV2Spider(scrapy.Spider):
    name = "eGlupi_vTest"
    start_urls = ["https://www.ekupi.hr/hr/Ku%C4%87anski-aparati/Bijela-tehnika/Perilice---su%C5%A1ilice-rublja/c/10128"]

    def parse(self, response):
        main_category_selector = response.xpath("//ol[@class='breadcrumb']/li[2]/a/text()").extract_first().strip()
        sub_category_selector = response.xpath("//ol[@class='breadcrumb']/li[3]/a/text()").extract_first().strip()
        sub_sub_category_selector = response.xpath("//ol[@class='breadcrumb']/li[@class='active']/text()").extract_first().strip()
        main_product_selector = "//div[@class='product-item']"
        product_link_selector = ".//a[@class='thumb']/@href"

        productData = {}
        for product in response.xpath(main_product_selector):
            productData["mainCategoryName"] = main_category_selector
            productData["subCategoryName"] = sub_category_selector
            productData["subSubCategoryName"] = sub_sub_category_selector
            productData["productLink"] = "https://www.ekupi.hr" + product.xpath(product_link_selector).extract_first()
            yield productData

        le = LinkExtractor(restrict_xpaths=[
            "//div[@class='pagination-bar top']/div[@class='pagination-toolbar']/div[@class='sort-refine-bar']/div[@class='row']/div[@class='col-xs-12 col-sm-6 col-md-5 pagination-wrap']/ul[@class='pagination']/li[@class='pagination-next']/a[@class='glyphicon glyphicon-chevron-right']"])
        for link in le.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse)
Here is the more complex code that goes to the 2nd layer and correctly scrapes product details, but, unlike the first program, returns duplicates for productLink:
import scrapy
from scrapy.linkextractors import LinkExtractor

class EglupiV2Spider(scrapy.Spider):
    name = "eGlupi_v2"
    start_urls = ["https://www.ekupi.hr/hr/Ku%C4%87anski-aparati/Bijela-tehnika/Perilice---su%C5%A1ilice-rublja/c/10128"]

    def parse(self, response):
        productData = {}
        main_category_selector = response.xpath("//ol[@class='breadcrumb']/li[2]/a/text()").extract_first().strip()
        sub_category_selector = response.xpath("//ol[@class='breadcrumb']/li[3]/a/text()").extract_first().strip()
        sub_sub_category_selector = response.xpath("//ol[@class='breadcrumb']/li[@class='active']/text()").extract_first().strip()
        main_product_selector = "//div[@class='product-item']"
        product_link_selector = ".//a[@class='thumb']/@href"

        for product in response.xpath(main_product_selector):
            productData["mainCategoryName"] = main_category_selector
            productData["subCategoryName"] = sub_category_selector
            productData["subSubCategoryName"] = sub_sub_category_selector
            productData["productLink"] = "https://www.ekupi.hr" + product.xpath(product_link_selector).extract_first()
            product_url = product.xpath(product_link_selector).extract_first()
            yield response.follow(product_url, self.parse_product_detail, meta={"productData": productData}, dont_filter=True)

        le = LinkExtractor(restrict_xpaths=["//div[@class='pagination-bar bottom']/div[@class='pagination-toolbar']/div[@class='sort-refine-bar']/div[@class='row']/div[@class='col-xs-12 col-sm-6 col-md-5 pagination-wrap']/ul[@class='pagination']/li[@class='pagination-next']/a[@class='glyphicon glyphicon-chevron-right']"])
        for link in le.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse, dont_filter=True)

    def parse_product_detail(self, response):
        product_detail_selector = "//div[@class='product-main-info']"
        table_rows = "//table/tbody"
        productData = response.meta["productData"]

        for productDetail in response.xpath(product_detail_selector):
            productData["productName"] = productDetail.xpath(".//div[@class='product-details page-title hidden-xs hidden-sm']/div[@class='name']/text()").extract_first().strip()
            productData["productPrice"] = productDetail.xpath(".//dd[@class='final-price']/text()").extract_first().strip()
            productData["productPriceOld"] = productDetail.xpath(".//dd[@class='old-price']/text()").extract_first(default="").strip()
            productData["productWarranty"] = productDetail.xpath(".//div[@class='info-hld']/p/b[contains(text(), 'Jamstvo:')]/text()").extract_first(default="").strip()
            productData["productPaymentWay"] = productDetail.xpath(".//div[@class='info-hld']/p[contains(text(), 'Platite ')]/text()").extract_first(default="").strip()
            productData["productReturn"] = productDetail.xpath(".//div[@class='info-hld']/p[contains(text(), 'Povrat ')]/text()").extract_first(default="").strip()
            productData["productDeliveryDate"] = productDetail.xpath(".//span[@class='ddate']/text()").extract_first(default="").strip()

        for table_row in response.xpath(table_rows):
            first_column = table_row.xpath("./tr/td[1]/text()").extract_first().strip()
            second_column = table_row.xpath("./tr/td[2]/text()").extract_first().strip()
            productData[first_column] = second_column

        yield productData
Here is an example of the data returned when I execute the 2nd program. Sorry that I shared the link on Pastebin, as SO only allows so many characters. Basically, it returns duplicate values for "productLink". Pastebin: https://pastebin.com/e2wrpfRu
Never mind, lack of sleep.
Answer, if anyone is looking for it: productData should be defined inside the for loop, otherwise I'm just sharing the same productData dict between all the iterations and only repopulating its values...
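For reference, a minimal sketch of the corrected loop - only the placement of productData changes, the selectors stay exactly as in the original:

    for product in response.xpath(main_product_selector):
        productData = {}  # fresh dict per product, so each forwarded item is independent
        productData["mainCategoryName"] = main_category_selector
        productData["subCategoryName"] = sub_category_selector
        productData["subSubCategoryName"] = sub_sub_category_selector
        productData["productLink"] = "https://www.ekupi.hr" + product.xpath(product_link_selector).extract_first()
        product_url = product.xpath(product_link_selector).extract_first()
        yield response.follow(product_url, self.parse_product_detail, meta={"productData": productData}, dont_filter=True)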
I want to scrape the table data from http://5000best.com/websites/
The content of the table is paginated across several pages and is dynamic.
I want to scrape the table data for each category. I can scrape the table manually for each category, but this is not what I want.
Please take a look and suggest an approach.
I am able to make links for each category, e.g. http://5000best.com/websites/Movies/, http://5000best.com/websites/Games/ etc.
But I am not sure how to go further and navigate through the paginated table for each category.
And after making all the links, I need to extract the table data using those links.
Edit: I am using requests and BeautifulSoup4.
Simple Scrapy spider:
import scrapy

class Best500Spider(scrapy.Spider):
    name = "best5000"
    start_urls = ['http://5000best.com/websites/1']

    def parse(self, response):
        for row in response.xpath('//table[@id="ttable"]//tr'):
            record = {}
            record["Rank"] = row.xpath('./td[1]/text()').get()
            record["Score"] = row.xpath('./td[2]/text()').get()
            record["Category"] = row.xpath('string(./td[3])').get()
            record["URL"] = row.xpath('string(./td[5])').get()
            yield record

        next_page_url = response.xpath('//div[@id="dpages"]/span[@class="pagen0"]/following-sibling::span[1]/a/@href').get()
        if next_page_url:
            yield scrapy.Request(
                url=response.urljoin(next_page_url),
                callback=self.parse
            )
I looked at the site; to move to another page, just add /pageNumber at the end of the link.
For example:
http://5000best.com/websites/50 will get you page 50.
You can use this tool to get Python requests code for one page and then add a loop: https://curl.trillworks.com/
Just put in "curl http://5000best.com/websites/50" and adapt your code from there.
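A minimal requests/BeautifulSoup sketch of that idea (the category URL and the 10-page cap are placeholders; the real page counts come from the category labels on the site):

    import requests
    from bs4 import BeautifulSoup

    category_url = "http://5000best.com/websites/Movies/"  # placeholder category
    for page in range(1, 11):                               # placeholder page cap
        resp = requests.get(category_url + str(page))
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "lxml")
        table = soup.find("table", {"id": "ttable"})
        if table is None:   # stop once a page no longer has the table
            break
        for row in table.find_all("tr"):
            print([td.get_text(strip=True) for td in row.find_all("td")])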
I came up with this approach to scrape the tables for each category.
# ------------Hemant Sah--------------------
# <- --------Importing Libraries-------- ->
import requests
from bs4 import BeautifulSoup
import pandas as pd
import math
import itertools
import sqlalchemy
import re

final_list = []
dataframe = pd.DataFrame([])

def make_soup(url):
    try:
        html = requests.get(url)
    except requests.exceptions.HTTPError as e:
        print(e)
    else:
        soup = BeautifulSoup(html.text, 'lxml')
        # print(html.status_code)
        return soup

def get_categories_from_soup(soup):
    total_list = []
    for item in soup.find_all('div', {"class": "sca2"}):
        total_list.append(item.text)
    for item in soup.find_all('div', {"class": "sca2_a"}):
        total_list.append(item.text)
    for item in soup.find_all('div', {"class": "sca2_b"}):
        total_list.append(item.text)
    for item in soup.find_all('div', {"class": "sca2_c"}):
        total_list.append(item.text)
    total_list.remove("All (5000)")
    total_list.remove("Porn (201)")
    return total_list

def make_url(total_list, url):
    path, page_num, test_page_num, modified_links, new_links = [], [], [], [], []
    for category in total_list:
        reg_exp_path = re.compile(r'^\w+')
        path.extend(reg_exp_path.findall(category))
        test_page_num.extend(re.findall('[0-9]+', category))
        # print(path)
    for c in test_page_num:
        temp = math.ceil(int(c) / 100)
        page_num.append(temp)
        # print(page_num)
    # print(page_num)
    for p in path:
        links = (url + p + "/")
        modified_links.append(links)
        # print(modified_links)
    for w, p in zip(modified_links, page_num):
        for n in range(1, p + 1):
            temp = w + str(n)
            new_links.append(temp)
    print(new_links)
    return new_links

def fetch_table_data(links):
    for l in links:
        soup = make_soup(l)
        my_table = soup.find('table', {'id': 'ttable'})
        rows = my_table.find_all('tr')
        for tr in rows:
            td = tr.find_all('td')
            row = [tr.text for tr in td]
            final_list.append(row)
    df = pd.DataFrame(final_list, columns=["Rank", "Score", "Category", "Audience", "URL", "Links", "blank", "Desc"])
    print(df)
    df = df.drop("blank", axis=1)
    # print(df)
    return df
    # df.to_csv('final_data.csv')

def main():
    url = "http://5000best.com/websites/"
    soup = make_soup(url)
    total_list = get_categories_from_soup(soup)
    links = make_url(total_list, url)
    dataframe = fetch_table_data(links)

if __name__ == "__main__":
    main()
import scrapy

class rlgSpider(scrapy.Spider):
    name = 'bot'
    start_urls = [
        'https://rocket-league.com/trading?filterItem=0&filterCertification=0&filterPaint=0&filterPlatform=1&filterSearchType=1&filterItemType=0&p=1']

    def parse(self, response):
        data = {}
        offers = response.xpath('//div[@class = "col-3-3"]')
        for offer in offers:
            for item in offer.xpath('//div[@class = "rlg-trade-display-container is--user"]/div[@class = "rlg-trade-display-items"]/div[@class = "col-1-2 rlg-trade-display-items-container"]/a'):
                data['name'] = item.xpath('//div/div[@position ="relative"]/h2').extract()
                yield data
Here is what I have so far - it doesn't work well. It scrapes the URL and not the h2 tag. How do I do that when it's nested inside so many divs?
In order to parse within an element in Scrapy, you need to start your XPath with "."; otherwise you will be parsing the whole response. This is the correct way of doing it:
def parse(self, response):
    offers = response.xpath('//div[@class = "col-3-3"]')
    for offer in offers:
        for item in offer.xpath('.//div[@class = "rlg-trade-display-container is--user"]/div[@class = "rlg-trade-display-items"]/div[@class = "col-1-2 rlg-trade-display-items-container"]/a'):
            data = {}
            data['name'] = item.xpath('.//h2/text()').extract_first()
            yield data
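As a small self-contained illustration of the difference (using a made-up HTML fragment, not the real page):

    from scrapy.selector import Selector

    html = '<div id="a"><h2>first</h2></div><div id="b"><h2>second</h2></div>'
    offer = Selector(text=html).xpath('//div[@id="b"]')[0]

    # '//h2' restarts from the document root, so it matches every h2 on the page
    print(offer.xpath('//h2/text()').getall())   # ['first', 'second']

    # './/h2' is relative to the selected element, so it only matches its own h2
    print(offer.xpath('.//h2/text()').getall())  # ['second']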
I'm currently writing a vacancies scraper with Scrapy to parse about 3M vacancy items.
Now I'm at the point where the spider works and successfully scrapes items and stores them in PostgreSQL, but it does so pretty slowly.
In 1 hour I stored only 12k vacancies, so I'm still really far from 3M of them.
The thing is that in the end I'm going to need to scrape and update the data once per day, and with the current performance I'm going to need more than a day just to parse all the data.
I'm new to data scraping, so I may be doing some basic thing wrong, and I'll be very grateful if anybody can help me.
Code of my spider:
import scrapy
import urllib.request
from lxml import html
from ..items import JobItem

class AdzunaSpider(scrapy.Spider):
    name = "adzuna"
    start_urls = [
        'https://www.adzuna.ru/search?loc=136073&pp=10'
    ]

    def parse(self, response):
        job_items = JobItem()
        items = response.xpath("//div[@class='sr']/div[@class='a']")

        def get_redirect(url):
            response = urllib.request.urlopen(url)
            response_code = response.read()
            result = str(response_code, 'utf-8')
            root = html.fromstring(result)
            final_url = root.xpath('//p/a/@href')[0]
            final_final_url = final_url.split('?utm', 1)[0]
            return final_final_url

        for item in items:
            id = None
            data_aid = item.xpath(".//@data-aid").get()
            redirect = item.xpath(".//h2/a/@href").get()
            url = get_redirect(redirect)
            url_header = item.xpath(".//h2/a/strong/text()").get()
            if item.xpath(".//p[@class='as']/@data-company-name").get() == None:
                company = item.xpath(".//p[@class='as']/text()").get()
            else:
                company = item.xpath(".//p[@class='as']/@data-company-name").get()
            loc = item.xpath(".//p/span[@class='loc']/text()").get()
            text = item.xpath(".//p[@class='at']/span[@class='at_tr']/text()").get()
            salary = item.xpath(".//p[@class='at']/span[@class='at_sl']/text()").get()

            job_items['id'] = id
            job_items['data_aid'] = data_aid
            job_items['url'] = url
            job_items['url_header'] = url_header
            job_items['company'] = company
            job_items['loc'] = loc
            job_items['text'] = text
            job_items['salary'] = salary

            yield job_items

        next_page = response.css("table.pg td:last-child ::attr('href')").get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)
Use indexes in your table
Insert in bulk instead of inserting one-by-one
Minimize the use of meta in your Requests
Use tuples instead of lists where possible
Set CONCURRENT_ITEMS=100; setting it higher decreases performance
Try to use fewer middlewares and pipelines
Set AUTOTHROTTLE_ENABLED=False in settings.py
Set TELNETCONSOLE_ENABLED=False in settings.py (a settings sketch follows this list)
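A minimal settings.py sketch reflecting the settings named above (only these three lines come from the list; everything else in your project stays as-is):

    # settings.py (excerpt)
    CONCURRENT_ITEMS = 100        # higher values were found to decrease performance
    AUTOTHROTTLE_ENABLED = False  # don't let autothrottle slow the crawl down
    TELNETCONSOLE_ENABLED = False # disable the telnet console extension

The database-side points (indexes, bulk inserts) belong in your table schema and item pipeline rather than in settings.py.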
I am scraping a news website with the Scrapy framework, but it seems to only store the last item scraped, repeated in a loop.
I want to store the Title, Date, and Link, which I scrape from the first page,
and also store the whole news article. So I want to merge the article, which is stored as a list, into a single string.
Item code
import scrapy

class ScrapedItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    source = scrapy.Field()
    date = scrapy.Field()
    paragraph = scrapy.Field()
Spider code
import scrapy
from ..items import ScrapedItem

class CBNCSpider(scrapy.Spider):
    name = 'kontan'
    start_urls = [
        'https://investasi.kontan.co.id/rubrik/28/Emiten'
    ]

    def parse(self, response):
        box_text = response.xpath("//ul/li/div[@class='ket']")
        items = ScrapedItem()
        for crawl in box_text:
            title = crawl.css("h1 a::text").extract()
            source = "https://investasi.kontan.co.id" + (crawl.css("h1 a::attr(href)").extract()[0])
            date = crawl.css("span.font-gray::text").extract()[0].replace("|", "")

            items['title'] = title
            items['source'] = source
            items['date'] = date

            yield scrapy.Request(url=source,
                                 callback=self.parseparagraph,
                                 meta={'item': items})

    def parseparagraph(self, response):
        items_old = response.meta['item']  # only last item stored
        paragraph = response.xpath("//p/text()").extract()
        items_old['paragraph'] = paragraph  # merge into single string
        yield items_old
I expect the Date, Title, and Source to be updated through the loop,
and the article to be merged into a single string so it can be stored in MySQL.
I defined an empty dictionary and put those variables within it. Moreover, I've made some minor changes to your XPath and CSS selectors to make them less error-prone. The script is working as desired now:
import scrapy

class CBNCSpider(scrapy.Spider):
    name = 'kontan'
    start_urls = [
        'https://investasi.kontan.co.id/rubrik/28/Emiten'
    ]

    def parse(self, response):
        for crawl in response.xpath("//*[@id='list-news']//*[@class='ket']"):
            d = {}
            d['title'] = crawl.css("h1 > a::text").get()
            d['source'] = response.urljoin(crawl.css("h1 > a::attr(href)").get())
            d['date'] = crawl.css("span.font-gray::text").get().strip("|")
            yield scrapy.Request(
                url=d['source'],
                callback=self.parseparagraph,
                meta={'item': d}
            )

    def parseparagraph(self, response):
        items_old = response.meta['item']
        items_old['paragraph'] = response.xpath("//p/text()").getall()
        yield items_old
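As a side note, on Scrapy 1.7+ the same hand-off can be done with cb_kwargs instead of meta (as in the first answer near the top of this page); a minimal sketch keeping the same names:

    # in parse():
    yield scrapy.Request(
        url=d['source'],
        callback=self.parseparagraph,
        cb_kwargs={'item': d},   # delivered to the callback as a keyword argument
    )

    # the callback then receives the dict directly:
    def parseparagraph(self, response, item):
        item['paragraph'] = response.xpath("//p/text()").getall()
        yield item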