I am fairly new to Python as well as web scraping. My first project is scraping random Craigslist cities (5 cities total) under the transportation sub-domain (i.e. https://dallas.craigslist.org), but I am stuck having to run the script manually for each city after updating that city's domain in the constants (start_urls = and absolute_next_url =) in the script. Is there any way I can adjust the script to run sequentially through the cities I have defined (i.e. miami, new york, houston, chicago, etc.) and auto-populate the constants (start_urls = and absolute_next_url =) for each respective city?
Also, is there a way to adjust the script so that it outputs each city into its own .csv (i.e. miami.csv, houston.csv, chicago.csv, etc.)?
Thank you in advance
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request


class JobsSpider(scrapy.Spider):
    name = "jobs"
    allowed_domains = ["craigslist.org"]
    start_urls = ['https://dallas.craigslist.org/d/transportation/search/trp']

    def parse(self, response):
        jobs = response.xpath('//p[@class="result-info"]')
        for job in jobs:
            listing_title = job.xpath('a/text()').extract_first()
            city = job.xpath('span[@class="result-meta"]/span[@class="result-hood"]/text()').extract_first("")[2:-1]
            job_posting_date = job.xpath('time/@datetime').extract_first()
            job_posting_url = job.xpath('a/@href').extract_first()
            data_id = job.xpath('a/@data-id').extract_first()
            yield Request(job_posting_url, callback=self.parse_page,
                          meta={'job_posting_url': job_posting_url, 'listing_title': listing_title,
                                'city': city, 'job_posting_date': job_posting_date, 'data_id': data_id})

        relative_next_url = response.xpath('//a[@class="button next"]/@href').extract_first()
        absolute_next_url = "https://dallas.craigslist.org" + relative_next_url
        yield Request(absolute_next_url, callback=self.parse)

    def parse_page(self, response):
        job_posting_url = response.meta.get('job_posting_url')
        listing_title = response.meta.get('listing_title')
        city = response.meta.get('city')
        job_posting_date = response.meta.get('job_posting_date')
        data_id = response.meta.get('data_id')

        description = "".join(line for line in response.xpath('//*[@id="postingbody"]/text()').extract()).strip()
        compensation = response.xpath('//p[@class="attrgroup"]/span[1]/b/text()').extract_first()
        employment_type = response.xpath('//p[@class="attrgroup"]/span[2]/b/text()').extract_first()
        latitude = response.xpath('//div/@data-latitude').extract_first()
        longitude = response.xpath('//div/@data-longitude').extract_first()
        posting_id = response.xpath('//p[@class="postinginfo"]/text()').extract()

        yield {'job_posting_url': job_posting_url,
               'data_id': data_id,
               'listing_title': listing_title,
               'city': city,
               'description': description,
               'compensation': compensation,
               'employment_type': employment_type,
               'latitude': latitude,
               'longitude': longitude,
               'job_posting_date': job_posting_date,
               'posting_id': posting_id}
There might be a cleaner way, but check out https://docs.scrapy.org/en/latest/topics/practices.html?highlight=multiple%20spiders: you can basically run multiple instances of your spider in the same process, so you can have a separate class (or spider argument) for each city. There are probably ways to consolidate some code so it's not all repeated.
As for writing to CSV, are you doing that via the command line right now? I'd add the code to the spider itself (https://realpython.com/python-csv/). A rough sketch combining both ideas is below.
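For what it's worth, here is a minimal sketch (not drop-in code) of one way to do both: give the spider a city argument, build start_urls from it, use response.urljoin() for the next page so the Dallas domain is never hard-coded, and run one crawl per city with its own CSV feed via a feed-URI template. The city list and file names are placeholders, and the FEEDS setting assumes a reasonably recent Scrapy (2.1+); on older versions FEED_URI = '%(city)s.csv' plus FEED_FORMAT = 'csv' does the same job.

import scrapy
from scrapy import Request
from scrapy.crawler import CrawlerProcess


class JobsSpider(scrapy.Spider):
    name = "jobs"
    allowed_domains = ["craigslist.org"]

    def __init__(self, city="dallas", *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.city = city  # also used by the %(city)s feed URI below
        self.start_urls = [f"https://{city}.craigslist.org/d/transportation/search/trp"]

    def parse(self, response):
        # ... same per-listing extraction as in the spider above ...
        relative_next_url = response.xpath('//a[@class="button next"]/@href').extract_first()
        if relative_next_url:
            # urljoin keeps the current city's domain, so no hard-coded absolute_next_url
            yield Request(response.urljoin(relative_next_url), callback=self.parse)


if __name__ == "__main__":
    process = CrawlerProcess(settings={
        # %(city)s is filled in from the spider attribute of the same name,
        # so each city's items land in miami.csv, houston.csv, and so on
        "FEEDS": {"%(city)s.csv": {"format": "csv"}},
    })
    for city in ["miami", "newyork", "houston", "chicago", "dallas"]:
        process.crawl(JobsSpider, city=city)
    process.start()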
I am building a spider with Scrapy. I want to access every item in a list and then scrape all the data inside each link, but when I run the spider it doesn't scrape the data. What am I missing?
import scrapy
from scrapy.linkextractors import LinkExtractor

from ..items import JobscraperItem


class JobscraperSpider(scrapy.Spider):
    name = 'jobspider'
    start_urls = ['https://cccc/bolsa/ofertas?oferta=&lugar=&categoria=']

    def parse(self, response):
        job_detail = response.xpath('//div[@class="list"]/div/a')
        yield from response.follow_all(job_detail, self.parse_jobspider)

    def parse(self, response):
        items = JobscraperItem()
        job_title = response.xpath('//h1/text()').extract()
        company = response.xpath('//h2/b/text()').extract()
        company_url = response.xpath('//div[@class="pull-left"]/a/text()').extract()
        description = response.xpath('//div[@class="aviso"]/text()').extract()
        salary = response.xpath('//div[@id="aviso"]/p[1]/text()').extract()
        city = response.xpath('//div[@id="aviso"]/p[2]/text()').extract()
        district = response.xpath('//div[@id="aviso"]/p[5]/text()').extract()
        publication_date = response.xpath('//div[@id="publicado"]/text()').extract()
        apply = response.xpath('//p[@class="text-center"]/b/text()').extract()
        job_type = response.xpath('//div[@id="resumen"]/p[3]/text()').extract()

        items['job_title'] = job_title
        items['company'] = company
        items['company_url'] = company_url
        items['description'] = description
        items['salary'] = salary
        items['city'] = city
        items['district'] = district
        items['publication_date'] = publication_date
        items['apply'] = apply
        items['job_type'] = job_type
        yield items
From what I can see, one of the issues is that you are creating two functions called parse(). Since you are calling self.parse_jobspider in your first parse function, I'm guessing that your second parse function is simply named incorrectly; rename it to parse_jobspider (a sketch follows).
Also, are you sure that the URL in start_urls is correct? https://cccc/bolsa/ofertas?oferta=&lugar=&categoria= doesn't resolve to anything, which would also explain why data isn't being scraped.
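Assuming that guess is right, the fix is just the method name; a minimal sketch with everything else unchanged:

    def parse(self, response):
        job_detail = response.xpath('//div[@class="list"]/div/a')
        yield from response.follow_all(job_detail, self.parse_jobspider)

    def parse_jobspider(self, response):
        # formerly the second parse(); renaming it stops it from overriding
        # the first one and lets the self.parse_jobspider callback resolve
        items = JobscraperItem()
        # ... same field extraction as before ...
        yield items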
I resolved this by adding this code to access every link and scrape the data inside:

rules = (
    Rule(LinkExtractor(allow=('/bolsa/166',)), follow=True, callback='parse_item'),
)
I'm currently writing a vacancy scraper with Scrapy to parse about 3M vacancy items.
Now I'm at the point where the spider works and successfully scrapes items and stores them in PostgreSQL, but the thing is it does so pretty slowly.
In 1 hour I stored only 12k vacancies, so I'm really far from 3M of them.
The thing is that in the end I'm going to need to scrape and update the data once per day, and with the current performance I'm going to need more than a day just to parse all the data.
I'm new to data scraping, so I may be doing some basic thing wrong, and I'll be very grateful if anybody can help me.
Code of my spider:
import scrapy
import urllib.request
from lxml import html
from ..items import JobItem


class AdzunaSpider(scrapy.Spider):
    name = "adzuna"
    start_urls = [
        'https://www.adzuna.ru/search?loc=136073&pp=10'
    ]

    def parse(self, response):
        job_items = JobItem()
        items = response.xpath("//div[@class='sr']/div[@class='a']")

        def get_redirect(url):
            response = urllib.request.urlopen(url)
            response_code = response.read()
            result = str(response_code, 'utf-8')
            root = html.fromstring(result)
            final_url = root.xpath('//p/a/@href')[0]
            final_final_url = final_url.split('?utm', 1)[0]
            return final_final_url

        for item in items:
            id = None
            data_aid = item.xpath(".//@data-aid").get()
            redirect = item.xpath(".//h2/a/@href").get()
            url = get_redirect(redirect)
            url_header = item.xpath(".//h2/a/strong/text()").get()
            if item.xpath(".//p[@class='as']/@data-company-name").get() == None:
                company = item.xpath(".//p[@class='as']/text()").get()
            else:
                company = item.xpath(".//p[@class='as']/@data-company-name").get()
            loc = item.xpath(".//p/span[@class='loc']/text()").get()
            text = item.xpath(".//p[@class='at']/span[@class='at_tr']/text()").get()
            salary = item.xpath(".//p[@class='at']/span[@class='at_sl']/text()").get()

            job_items['id'] = id
            job_items['data_aid'] = data_aid
            job_items['url'] = url
            job_items['url_header'] = url_header
            job_items['company'] = company
            job_items['loc'] = loc
            job_items['text'] = text
            job_items['salary'] = salary
            yield job_items

        next_page = response.css("table.pg td:last-child ::attr('href')").get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)
Use indexes in your table
Insert in BULK instead of inserting one-by-one (a sketch of a bulk-insert pipeline follows this list)
Minimize use of meta in your Request
Use tuple instead of list where possible
Set CONCURRENT_ITEMS=100; setting it higher decreases performance
Try to use fewer middlewares and pipelines
Set AUTOTHROTTLE_ENABLED=False in settings.py
Set TELNETCONSOLE_ENABLED=False in settings.py
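To make the bulk-insert point concrete, here is a minimal sketch of an item pipeline that buffers items and writes them in batches with psycopg2's execute_values. The table and column names (vacancies, data_aid, url, salary), the connection details, and the batch size are placeholders; adapt them to your actual schema.

# settings.py:  ITEM_PIPELINES = {"myproject.pipelines.BulkPostgresPipeline": 300}
import psycopg2
from psycopg2.extras import execute_values


class BulkPostgresPipeline:
    def __init__(self, batch_size=500):
        self.batch_size = batch_size
        self.buffer = []

    def open_spider(self, spider):
        self.conn = psycopg2.connect(dbname="jobs", user="scraper", host="localhost")
        self.cur = self.conn.cursor()

    def process_item(self, item, spider):
        self.buffer.append((item.get("data_aid"), item.get("url"), item.get("salary")))
        if len(self.buffer) >= self.batch_size:
            self._flush()
        return item

    def _flush(self):
        # one round-trip for the whole batch instead of one INSERT per item
        execute_values(
            self.cur,
            "INSERT INTO vacancies (data_aid, url, salary) VALUES %s",
            self.buffer,
        )
        self.conn.commit()
        self.buffer = []

    def close_spider(self, spider):
        if self.buffer:
            self._flush()
        self.cur.close()
        self.conn.close()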
I am trying to scrape information about every firm on this website: www.canadianlawlist.com
I have finished most of it, but I am running into a small problem.
I am trying to get the results to display in the following order:
-Firm Name and Information
*Employees from the firm and their information.
But instead I am getting very random results. It will scrape information about 2 firms and then scrape the information of employees, like this:
-Firm Name and Information
-Firm name and information
*Employee from Firm 1
-Firm name and information
*Employee from Firm 2
It goes something like that. I am not sure what I am missing in my code:
def parse_after_submit(self, response):
    basicurl = "canadianlawlist.com/"
    products = response.xpath('//*[@class="searchresult_item_regular"]/a/@href').extract()
    for p in products:
        url = "http://canadianlawlist.com" + p
        yield scrapy.Request(url, callback=self.parse_firm_info)
    # process next page
    # for x in range(2, 6):
    #     next_page_url = "https://www.canadianlawlist.com/searchresult?searchtype=firms&city=montreal&page=" + str(x)

def parse_firm_info(self, response):
    name = response.xpath('//div[@class="listingdetail_companyname"]/h1/span/text()').extract_first()
    print name
    for info in response.xpath('//*[@class="listingdetail_contactinfo"]'):
        street_address = info.xpath('//div[@class="listingdetail_contactinfo"]/div[1]/span/div/text()').extract_first()
        city = info.xpath('//*[@itemprop="addressLocality"]/text()').extract_first(),
        province = info.xpath('//*[@itemprop="addressRegion"]/text()').extract_first(),
        postal_code = info.xpath('//*[@itemprop="postalCode"]/text()').extract_first(),
        telephone = info.xpath('//*[@itemprop="telephone"]/text()').extract_first(),
        fax_number = info.xpath('//*[@itemprop="faxNumber"]/text()').extract_first(),
        email = info.xpath('//*[@itemprop="email"]/text()').extract_first(),
        print street_address
        print city
        print province
        print postal_code
        print telephone
        print fax_number
        print email
    for people in response.xpath('//div[@id="main_block"]/div[1]/div[2]/div[2]'):
        pname = people.xpath('//*[@class="listingdetail_individual_item"]/h3/a/text()').extract()
        print pname
    basicurl = "canadianlawlist.com/"
    employees = response.xpath('//*[@class="listingdetail_individual_item"]/h3/a/@href').extract()
    for e in employees:
        url2 = "http://canadianlawlist.com" + e
        yield scrapy.Request(url2, callback=self.parse_employe_info)

def parse_employe_info(self, response):
    ename = response.xpath('//*[@class="listingdetail_individualname"]/h1/span/text()').extract_first()
    job_title = response.xpath('//*[@class="listingdetail_individualmaininfo"]/div/i/span/text()').extract_first()
    print ename
    print job_title
You cannot rely on the order of Python's print output when it comes to concurrent programming. If you care about the order of standard output, you need to use the logging module.
Scrapy has a shortcut for that in the Spider class:
import scrapy
import logging


class MySpider(scrapy.Spider):

    def parse(self, response):
        self.log("first message", level=logging.INFO)
        self.log("second message", level=logging.INFO)
Scrapy runs multiple requests at the same time, so the content displayed on the console can correspond to any of the requests running at that moment.
You can go to settings.py and set
CONCURRENT_REQUESTS = 1
Now only one request will be launched at a time, so your console will show data in a meaningful order, but this will make the scraping slower.
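If you only want that behaviour for this one spider rather than project-wide, custom_settings on the spider class does the same thing; a minimal sketch (the spider name here is just a placeholder):

import scrapy


class FirmsSpider(scrapy.Spider):
    name = "firms"  # placeholder name
    custom_settings = {
        "CONCURRENT_REQUESTS": 1,  # overrides settings.py for this spider only
    }

    def parse(self, response):
        pass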
As some of you may have gathered, I'm learning scrapy to scrape some data off of Google Scholar for a research project that I am running. I have a file that contains many article titles for which I am scraping citations. I read in the file using pandas, generate the URLs that need scraping, and start scraping.
One problem that I face is 503 errors. Google shuts me off fairly quickly, and many entries remain unscraped. This is a problem that I am working on using some middleware provided by Crawlera.
Another problem I face is that when I export my scraped data, I have a hard time matching the scraped data to what I was trying to look for. My input data is a CSV file with three fields -- 'Authors','Title','pid' where 'pid' is a unique identifier.
I use pandas to read in the file and generate URLs for scholar based off the title. Each time a given URL is scraped, my spider goes through the scholar webpage, and picks up the title, publication information and cites for each article listed on that page.
Here is how I generate the links for scraping:
class ScholarSpider(Spider):
    name = "scholarscrape"
    allowed_domains = ["scholar.google.com"]

    # get the data
    data = read_csv("../../data/master_jeea.csv")
    # get the titles
    queries = data.Title.apply(urllib.quote)
    # generate a var to store links
    links = []
    # create the URLs to crawl
    for entry in queries:
        links.append("http://scholar.google.com/scholar?q=allintitle%3A" + entry)
    # give the URLs to scrapy
    start_urls = links
For example, one title from my data file could be the paper 'Elephants Don't Play Chess' by Rodney Brooks with 'pid' 5067. The spider goes to
http://scholar.google.com/scholar?q=allintitle%3Aelephants+don%27t+play+chess
Now on this page, there are six hits. The spider gets all six hits, but they need to be assigned the same 'pid'. I know I need to insert a line somewhere that reads something like item['pid'] = data.pid.apply("something") but I can't figure out exactly how I would do that.
Below is the rest of the code for my spider. I am sure the way to do this is pretty straightforward, but I can't think of how to get the spider to know which entry of data.pid it should look for if that makes sense.
    def parse(self, response):
        # initialize something to hold the data
        items = []
        sel = Selector(response)
        # get each 'entry' on the page
        # an entry is a self-contained div
        # that has the title, publication info
        # and cites
        entries = sel.xpath('//div[@class="gs_ri"]')
        # a counter for the entry that is being scraped
        count = 1
        for entry in entries:
            item = ScholarscrapeItem()
            # get the title
            title = entry.xpath('.//h3[@class="gs_rt"]/a//text()').extract()
            # the title is messy
            # clean up
            item['title'] = "".join(title)
            # get publication info
            # clean up
            author = entry.xpath('.//div[@class="gs_a"]//text()').extract()
            item['authors'] = "".join(author)
            # get the portion that contains citations
            cite_string = entry.xpath('.//div[@class="gs_fl"]//text()').extract()
            # find the part that says "Cited by"
            match = re.search("Cited by \d+", str(cite_string))
            # if it exists, note the number
            if match:
                cites = re.search("\d+", match.group()).group()
            # if not, there is no citation info
            else:
                cites = None
            item['cites'] = cites
            item['entry'] = count
            # iterate the counter
            count += 1
            # append this item to the list
            items.append(item)
        return items
I hope this question is well-defined, but please let me know if I can be more clear. There is really not much else in my scraper except some lines at the top importing things.
Edit 1: Based on suggestions below, I have modified my code as follows:
# test-case: http://scholar.google.com/scholar?q=intitle%3Amigratory+birds
import re
from pandas import *
import urllib
from scrapy.spider import Spider
from scrapy.selector import Selector
from scholarscrape.items import ScholarscrapeItem


class ScholarSpider(Spider):
    name = "scholarscrape"
    allowed_domains = ["scholar.google.com"]

    # get the data
    data = read_csv("../../data/master_jeea.csv")
    # get the titles
    queries = data.Title.apply(urllib.quote)
    pid = data.pid
    # generate a var to store links
    urls = []
    # create the URLs to crawl
    for entry in queries:
        urls.append("http://scholar.google.com/scholar?q=allintitle%3A" + entry)
    # give the URLs to scrapy
    start_urls = (
        (urls, pid),
    )

    def make_requests_from_url(self, (url, pid)):
        return Request(url, meta={'pid': pid}, callback=self.parse, dont_filter=True)

    def parse(self, response):
        # initialize something to hold the data
        items = []
        sel = Selector(response)
        # get each 'entry' on the page
        # an entry is a self-contained div
        # that has the title, publication info
        # and cites
        entries = sel.xpath('//div[@class="gs_ri"]')
        # a counter for the entry that is being scraped
        count = 1
        for entry in entries:
            item = ScholarscrapeItem()
            # get the title
            title = entry.xpath('.//h3[@class="gs_rt"]/a//text()').extract()
            # the title is messy
            # clean up
            item['title'] = "".join(title)
            # get publication info
            # clean up
            author = entry.xpath('.//div[@class="gs_a"]//text()').extract()
            item['authors'] = "".join(author)
            # get the portion that contains citations
            cite_string = entry.xpath('.//div[@class="gs_fl"]//text()').extract()
            # find the part that says "Cited by"
            match = re.search("Cited by \d+", str(cite_string))
            # if it exists, note the number
            if match:
                cites = re.search("\d+", match.group()).group()
            # if not, there is no citation info
            else:
                cites = None
            item['cites'] = cites
            item['entry'] = count
            item['pid'] = response.meta['pid']
            # iterate the counter
            count += 1
            # append this item to the list
            items.append(item)
        return items
You need to populate your list start_urls with tuples (url, pid).
Now redefine the method make_requests_from_url(url):
class ScholarSpider(Spider):
    name = "ScholarSpider"
    allowed_domains = ["scholar.google.com"]
    start_urls = (
        ('http://www.scholar.google.com/', 100),
    )

    def make_requests_from_url(self, (url, pid)):
        return Request(url, meta={'pid': pid}, callback=self.parse, dont_filter=True)

    def parse(self, response):
        pid = response.meta['pid']
        print '!!!!!!!!!!!', pid, '!!!!!!!!!!!!'
        pass
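Worth noting: the tuple-unpacking signature def make_requests_from_url(self, (url, pid)) only works on Python 2, and make_requests_from_url has since been deprecated in Scrapy. On Python 3 / current Scrapy the same idea is usually expressed with start_requests; a minimal sketch, assuming the same CSV with Title and pid columns:

import urllib.parse

import scrapy
from pandas import read_csv


class ScholarSpider(scrapy.Spider):
    name = "scholarscrape"
    allowed_domains = ["scholar.google.com"]

    def start_requests(self):
        data = read_csv("../../data/master_jeea.csv")
        for title, pid in zip(data.Title, data.pid):
            url = ("http://scholar.google.com/scholar?q=allintitle%3A"
                   + urllib.parse.quote(title))
            # carry the row's pid along with the request
            yield scrapy.Request(url, meta={'pid': pid},
                                 callback=self.parse, dont_filter=True)

    def parse(self, response):
        # every item parsed from this page can now be tagged with the pid
        pid = response.meta['pid']
        yield {'pid': pid}  # plus the title/authors/cites fields as before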
I am attempting to scrape the Library of Congress/Thomas website. This Python script is intended to access a sample of 40 bills from their site (identifiers 1-40 in the URLs). I want to parse the body of each piece of legislation, search the body/content, extract links to potential multiple versions, and follow them.
Once on the version page(s) I want to parse the body of each version, search the body/content, and extract links to potential sections and follow them.
Once on the section page(s) I want to parse the body of each section of a bill.
I believe there is some issue with the Rules/LinkExtractor segment of my code. The Python code executes and crawls the start URLs, but does not parse them or do any of the subsequent tasks.
Three issues:
Some bills do not have multiple versions (and ergo no links in the body portion of the URL).
Some bills do not have linked sections because they are so short, while some are nothing but links to sections.
Some section links do not contain just section-specific content, and most of the content is just redundant inclusion of prior or subsequent section content.
My question is again: why is Scrapy not crawling or parsing?
from scrapy.item import Item, Field
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector


class BillItem(Item):
    title = Field()
    body = Field()


class VersionItem(Item):
    title = Field()
    body = Field()


class SectionItem(Item):
    body = Field()


class Lrn2CrawlSpider(CrawlSpider):
    name = "lrn2crawl"
    allowed_domains = ["thomas.loc.gov"]
    start_urls = ["http://thomas.loc.gov/cgi-bin/query/z?c107:H.R.%s:" % bill for bill in xrange(000001, 00040, 00001)  ### Sample of 40 bills; Total range of bills is 1-5767
                  ]

rules = (
    # Extract links matching the /query/ fragment (restricted to those inside the content body of the page)
    # and follow links from them (since no callback means follow=True by default).
    # Desired result: scrape all bill text & in the event that there are multiple versions, follow them & parse.
    Rule(SgmlLinkExtractor(allow=(r'/query/'), restrict_xpaths=('//div[@id="content"]')), callback='parse_bills', follow=True),
    # Extract links in the body of a bill version & follow them.
    # Desired result: scrape all version text & in the event that there are multiple sections, follow them & parse.
    Rule(SgmlLinkExtractor(restrict_xpaths=('//div/a[2]')), callback='parse_versions', follow=True)
)

def parse_bills(self, response):
    hxs = HtmlXPathSelector(response)
    bills = hxs.select('//div[@id="content"]')
    scraped_bills = []
    for bill in bills:
        scraped_bill = BillItem()  ### Bill object defined previously
        scraped_bill['title'] = bill.select('p/text()').extract()
        scraped_bill['body'] = response.body
        scraped_bills.append(scraped_bill)
    return scraped_bills

def parse_versions(self, response):
    hxs = HtmlXPathSelector(response)
    versions = hxs.select('//div[@id="content"]')
    scraped_versions = []
    for version in versions:
        scraped_version = VersionItem()  ### Version object defined previously
        scraped_version['title'] = version.select('center/b/text()').extract()
        scraped_version['body'] = response.body
        scraped_versions.append(scraped_version)
    return scraped_versions

def parse_sections(self, response):
    hxs = HtmlXPathSelector(response)
    sections = hxs.select('//div[@id="content"]')
    scraped_sections = []
    for section in sections:
        scraped_section = SectionItem()  ## Segment object defined previously
        scraped_section['body'] = response.body
        scraped_sections.append(scraped_section)
    return scraped_sections
spider = Lrn2CrawlSpider()
Just for the record, the problem with your script is that the variable rules is not inside the scope of Lrn2CrawlSpider because it doesn't share the same indentation, so when alecxe fixed the indentation, rules became an attribute of the class. The inherited __init__() method then reads that attribute, compiles the rules, and enforces them:
def __init__(self, *a, **kw):
    super(CrawlSpider, self).__init__(*a, **kw)
    self._compile_rules()
Erasing the last line had nothing to do with that.
I've just fixed the indentation, removed the spider = Lrn2CrawlSpider() line at the end of the script, ran the spider via scrapy runspider lrn2crawl.py, and it scrapes, follows links, and returns items - your rules work.
Here's what I'm running:
from scrapy.item import Item, Field
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector


class BillItem(Item):
    title = Field()
    body = Field()


class VersionItem(Item):
    title = Field()
    body = Field()


class SectionItem(Item):
    body = Field()


class Lrn2CrawlSpider(CrawlSpider):
    name = "lrn2crawl"
    allowed_domains = ["thomas.loc.gov"]
    start_urls = ["http://thomas.loc.gov/cgi-bin/query/z?c107:H.R.%s:" % bill for bill in xrange(000001, 00040, 00001)  ### Sample of 40 bills; Total range of bills is 1-5767
                  ]

    rules = (
        # Extract links matching the /query/ fragment (restricted to those inside the content body of the page)
        # and follow links from them (since no callback means follow=True by default).
        # Desired result: scrape all bill text & in the event that there are multiple versions, follow them & parse.
        Rule(SgmlLinkExtractor(allow=(r'/query/'), restrict_xpaths=('//div[@id="content"]')), callback='parse_bills', follow=True),
        # Extract links in the body of a bill version & follow them.
        # Desired result: scrape all version text & in the event that there are multiple sections, follow them & parse.
        Rule(SgmlLinkExtractor(restrict_xpaths=('//div/a[2]')), callback='parse_versions', follow=True)
    )

    def parse_bills(self, response):
        hxs = HtmlXPathSelector(response)
        bills = hxs.select('//div[@id="content"]')
        scraped_bills = []
        for bill in bills:
            scraped_bill = BillItem()  ### Bill object defined previously
            scraped_bill['title'] = bill.select('p/text()').extract()
            scraped_bill['body'] = response.body
            scraped_bills.append(scraped_bill)
        return scraped_bills

    def parse_versions(self, response):
        hxs = HtmlXPathSelector(response)
        versions = hxs.select('//div[@id="content"]')
        scraped_versions = []
        for version in versions:
            scraped_version = VersionItem()  ### Version object defined previously
            scraped_version['title'] = version.select('center/b/text()').extract()
            scraped_version['body'] = response.body
            scraped_versions.append(scraped_version)
        return scraped_versions

    def parse_sections(self, response):
        hxs = HtmlXPathSelector(response)
        sections = hxs.select('//div[@id="content"]')
        scraped_sections = []
        for section in sections:
            scraped_section = SectionItem()  ## Segment object defined previously
            scraped_section['body'] = response.body
            scraped_sections.append(scraped_section)
        return scraped_sections
Hope that helps.
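A side note for anyone reading this now: the scrapy.contrib imports and SgmlLinkExtractor above have long since been removed from Scrapy, and THOMAS itself has been retired in favour of congress.gov. Purely as an illustration, here is a rough sketch of the same spider skeleton against current Scrapy imports (XPaths and URL pattern copied unchanged from the code above; parse_versions and parse_sections would follow the same pattern):

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class BillItem(scrapy.Item):
    title = scrapy.Field()
    body = scrapy.Field()


class Lrn2CrawlSpider(CrawlSpider):
    name = "lrn2crawl"
    allowed_domains = ["thomas.loc.gov"]
    start_urls = ["http://thomas.loc.gov/cgi-bin/query/z?c107:H.R.%s:" % bill
                  for bill in range(1, 40)]

    rules = (
        # same rule logic as above, using the maintained LinkExtractor
        Rule(LinkExtractor(allow=(r'/query/',), restrict_xpaths=('//div[@id="content"]',)),
             callback='parse_bills', follow=True),
        Rule(LinkExtractor(restrict_xpaths=('//div/a[2]',)),
             callback='parse_versions', follow=True),
    )

    def parse_bills(self, response):
        for bill in response.xpath('//div[@id="content"]'):
            item = BillItem()
            item['title'] = bill.xpath('p/text()').getall()
            item['body'] = response.text
            yield item

    def parse_versions(self, response):
        # analogous to parse_bills; omitted for brevity
        pass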