I have multiple spiders with different items, and I want to export each item type into a separate CSV file. I used the code example from "How can scrapy export items to separate csv files per item", but there is a problem.
Right now my spider only writes the "page" item. All items are filled correctly in the shell, but the other files stay empty. I debugged the pipeline but haven't found an error so far.
Here is my spider:
import csv

import scrapy
from BeautifulSoup import BeautifulSoup
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.spiders import Rule

from DataSpiders import CSV_PATH
from ScrapingItems import TrierDeItem
from SuperSpider import SuperSpider

HTML_PATH = 'pages/trier.de/'


class TrierDeSpider(scrapy.Spider, SuperSpider):
    name = 'trierDeSpider'
    allowed_domains = ['trier.de']
    denied_domains = []
    start_urls = [
        'https://www.trier.de/rathaus-buerger-in/trier-in-zahlen/',
        'https://trier.de/startseite/',
        'https://www.trier.de/leben-in-trier/',
        'https://www.trier.de/kultur-freizeit/',
        'https://www.trier.de/wirtschaft-arbeit/',
        'https://www.trier.de/bildung-wissenschaft/',
        'https://www.trier.de/bauen-wohnen/',
        'https://www.trier.de/umwelt-verkehr/',
    ]
    # Set the starting point for the spider and start crawling from start_urls
    rules = (Rule(LxmlLinkExtractor(allow=()), callback='parse', follow=True),)
    def parse(self, response):
        """
        Parse the page body for links. Follow allowed domains by adding them to the request. Parse each linked page
        with the callback parse_page.
        :param response:
        :return:
        """
        for link in LxmlLinkExtractor(allow=self.allowed_domains, deny=self.denied_domains).extract_links(response):
            yield scrapy.Request(response.urljoin(link.url), callback=self.parse_page)

    def parse_page(self, response):
        """
        Parse the current page for information.
        :param response:
        :return:
        """
        trier_de_item = TrierDeItem()
        yield self.parse_general_page_info(response, HTML_PATH)
        # extract the page url
        trier_de_item["url"] = response.url
        # extract the crawling datetime
        trier_de_item["crawling_date_time"] = response.headers['Date']
        # extract page title
        trier_de_item["title"] = response.css('title::text').extract()
        # extract description tags
        trier_de_item["description"] = response.xpath('//meta[@name="description"]/@content').extract()
        trier_de_item["og_description"] = response.xpath('//meta[@name="og:description"]/@content').extract()
        # extract all page headers
        trier_de_item["news_title"] = response.xpath('//div[@class="dachzeile"]/text()').extract()
        # extract topic
        trier_de_item["topic"] = response.xpath('//div[@class="topic"]/text()').extract()
        # extract headlines
        trier_de_item['headlines'] = response.xpath('//h1/text()').extract()
        # check if the page contains a table
        table = response.xpath('//table[@class="datentabelle"]').extract()
        if len(table) > 0:
            self.parse_table(response.body, trier_de_item['headlines'][0])
        yield trier_de_item
    @staticmethod
    def parse_table(body_html, title):
        '''
        Parse an HTML page containing a table and save it to a csv file
        :param body_html:
        :param title:
        :return:
        '''
        title = title.replace('/', '')
        try:
            # Create the filename from the title
            filename = title + '.csv'
            soup = BeautifulSoup(body_html)
            soup.prettify('utf-8')
            content = []
            # find all tables in the html
            tables = soup.findAll('table')
            for table in tables:
                # find each table row
                for row in table.findAll('tr'):
                    # extract each table header and cell and add its text to the line
                    line = []
                    for header in row.findAll('th'):
                        if ' ' in header.text:
                            line.append('')
                        else:
                            line.append(header.text)
                    for cell in row.findAll('td'):
                        if ' ' in cell.text:
                            line.append('')
                        else:
                            line.append(cell.text)
                    content.append(line)
            # Open a new csv file and write each line to the file
            with open(CSV_PATH + filename, 'wb') as csv_file:
                wr = csv.writer(csv_file)
                for line in content:
                    wr.writerow(line)
        except Exception as e:
            print(e)
            pass
SuperSpider:
import urlparse

from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor

from DataSpiders import write_html
from DataSpiders.ScrapingItems import PageItem, BaseItem

ALLOWED_FILE_TYPES = ('.pdf', '.csv', '.xls', '.xlsx')


class SuperSpider:
    def __init__(self):
        pass

    def url_join(self, urls, response):
        '''
        Join URLs with the response URL
        :param urls:
        :param response:
        :return:
        '''
        joined_urls = []
        for url in urls:
            joined_urls.append(response.urljoin(url))
        return joined_urls

    def parse_general_page_info(self, response, HTML_PATH):
        page_item = PageItem()
        page_item["url"] = response.url
        # save the response body
        if 'jsp' in response.url:
            url = response.url.split('.jsp')
            write_html(url[0], response.body, HTML_PATH)
        elif '?' in response.url:
            url = response.url.split('?')
            write_html(url[0], response.body, HTML_PATH)
        else:
            write_html(response.url, response.body, HTML_PATH)
        # Search for files that have any allowed file type
        found_files = []
        domain = response.url.split('/')[2]
        for a in response.xpath('//a[@href]/@href'):
            link = a.extract()
            if link.endswith(ALLOWED_FILE_TYPES):
                link = urlparse.urljoin(domain, link)
                found_files.append(link)
        # extract all referring links
        extractor = LxmlLinkExtractor()
        linklist = []
        for link in extractor.extract_links(response):
            # extract links which contain a file in the url and add those to 'found_files' for downloading
            if '?imgUid' in link.url:
                fullpath = link.url
                path = fullpath.split('.de')[1]
                found_files.append(urlparse.urljoin(domain, path))
            else:
                linklist.append(link.url)
        page_item["links"] = linklist
        # add all files to the page item
        page_item["file_urls"] = self.url_join(found_files, response)
        # extract page title
        page_item["title"] = response.css('title::text').extract()
        # extract all image urls
        relative_img_urls = response.css("img::attr(src)").extract()
        page_item["image_urls"] = self.url_join(relative_img_urls, response)
        return page_item

    def parse_base_page_information(self, response):
        baseItem = BaseItem()
        baseItem["url"] = response.url
        # extract page title
        baseItem["title"] = response.css('title::text').extract()
        baseItem["crawling_date_time"] = response.headers['Date']
        # extract description tags
        baseItem["description"] = response.xpath('//meta[@name="description"]/@content').extract()
        baseItem["og_description"] = response.xpath('//meta[@name="og:description"]/@content').extract()
        baseItem['headlines'] = response.xpath('//h1/text()').extract()
        return baseItem
ScrapingItems:
from scrapy import Item, Field


class PageItem(Item):
    url = Field()
    title = Field()
    image_urls = Field()
    file_urls = Field()
    links = Field()


class BaseItem(Item):
    url = Field()
    title = Field()
    crawling_date_time = Field()
    description = Field()
    og_description = Field()
    headlines = Field()


class TrierDeItem(BaseItem):
    news_title = Field()
    tag = Field()
    topic = Field()
And the Multi CSV Pipeline:
from scrapy import signals
from scrapy.exporters import CsvItemExporter
from scrapy.xlib.pydispatch import dispatcher  # in newer Scrapy: from pydispatch import dispatcher


class MultiCSVItemPipeline(object):
    CSVPath = "csv_data/"
    SaveTypes = ['page', 'base', 'trierde', 'triermitgestalten', 'teleport', 'lokalocomment', 'lokalo', 'lokalonews']

    def __init__(self):
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        self.files = dict([(name, open(self.CSVPath + name + '.csv', 'ab')) for name in self.SaveTypes])
        self.exporters = dict([(name, CsvItemExporter(self.files[name])) for name in self.SaveTypes])
        [e.start_exporting() for e in self.exporters.values()]

    def spider_closed(self, spider):
        [e.finish_exporting() for e in self.exporters.values()]
        [f.close() for f in self.files.values()]

    def process_item(self, item, spider):
        what = item_type(item)
        if what in set(self.SaveTypes):
            self.exporters[what].export_item(item)
        return item


def item_type(item):
    '''
    Returns the scraping item name
    :param item:
    :return:
    '''
    return type(item).__name__.replace('Item', '').lower()
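For completeness, the pipeline is enabled in settings.py roughly like this (a minimal sketch; the exact module path is an assumption):
# settings.py -- the module path here is an assumption
ITEM_PIPELINES = {
    'DataSpiders.pipelines.MultiCSVItemPipeline': 300,
}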
I haven't found a solution yet, but I have tried several things that failed:
yielding a list of items, which doesn't work with Scrapy
yielding only one item and creating two parse methods for page_item and trier_item
deleting all SaveTypes except 'trierde'; the spider didn't write anything
Based on these attempts I believe that there is some error in the pipeline itself...
I appreciate any help anybody can offer.
Additional Info:
Before changing my pipeline to MultiCSV I was able to save each item to CSV.
Since I wasn't able to fix the problem with the Scrapy exporter, I decided to write my own.
Here's the code for everyone who wants to export multiple different items to different CSV files from one or more spiders. It has worked for me so far, but I'm still checking the code for errors. Feel free to reply if you have ideas for improvement.
import csv
import logging
import os
import types

from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher  # in newer Scrapy: from pydispatch import dispatcher


class MultiCSVItemPipeline(object):
    # Subfolder path where the csv files are stored
    CSVPath = "csv_data/"
    # All allowed items
    SaveTypes = ['page', 'base', 'trierde', 'triermitgestalten', 'teleport', 'lokalocomment', 'lokalo', 'lokalonews']
    # List of already checked csv headers
    CheckedHeaders = []

    def __init__(self):
        import sys
        reload(sys)
        sys.setdefaultencoding('utf8')
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        # Check if the csv files exist and create new ones if not
        for file in set(self.SaveTypes):
            f = open(self.CSVPath + file + '.csv', 'a+')
            f.close()

    def spider_closed(self, spider):
        # not needed anymore
        # [e.finish_exporting() for e in self.exporters.values()]
        # [f.close() for f in self.files.values()]
        pass

    def process_item(self, item, spider):
        what = item_type(item)
        if what in set(self.SaveTypes):
            try:
                # Check if the csv file contains a header, but only for files that haven't been checked yet
                if what not in self.CheckedHeaders:
                    self.check_header(what, item)
                self.write_item_to_row(item, what)
            except Exception as e:
                logging.error("########################################################")
                logging.error("Error writing to " + what + ".csv file ")
                logging.error("Error Message: " + e.message)
                logging.error("Error Reason: " + e.reason)
                logging.error("Error Object: " + e.object)
                logging.error("########################################################")
        return item

    def write_item_to_row(self, item, what):
        """
        Write a single item to a row in the csv file
        :param item:
        :param what:
        :return:
        """
        ofile = open(self.CSVPath + what + '.csv', "ab")
        writer = csv.writer(ofile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        item_dict = item.__dict__['_values']
        row = []
        for k in item_dict:
            d = item_dict[k]
            # If the field is not a list, join the element to a string; strip delimiters and encode as utf-8
            if not isinstance(d, types.ListType):
                value = ''.join(item_dict[k]).replace('\t', '').replace('\n', '').encode('utf8')
            else:
                value = ','.join(item_dict[k]).replace('\t', '').replace('\n', '').encode('utf8')
            row.append(value)
        writer.writerow(row)
        ofile.close()

    def check_header(self, what, item):
        """
        Check if the file contains header elements and create them if missing
        :param what:
        :param item:
        :return:
        """
        try:
            with open(self.CSVPath + what + '.csv', 'ab+') as csvfile:
                writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
                item_dict = item.__dict__['_values']
                # If the file is empty, create a new csv header
                if os.stat(self.CSVPath + what + '.csv').st_size == 0:
                    self.write_csv_header(item_dict, writer)
                else:
                    # Read the first row and check the header elements
                    read_csv = csv.reader(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
                    first_row = read_csv.next()
                    # if not all headers are set in the csv file, print a warning
                    if not self.check_key_in_csv_header(item_dict, first_row):
                        # TODO: Add missing header to the csv file
                        logging.warning("Wrong headers for file " + what + ".csv")
                self.CheckedHeaders.append(what)
                csvfile.close()
            return True
        except Exception as e:
            logging.error(e.message)
            return False

    @staticmethod
    def write_csv_header(item_dict, writer):
        """
        Write the header of a csv file.
        The header is written from the keys of the scrapy item.
        :param item_dict:
        :param writer:
        :return:
        """
        first_row = []
        for k in item_dict:
            # Join each key to a string, strip delimiters and encode as utf-8
            value = ''.join(k).replace('\t', '').replace('\n', '').encode('utf8')
            first_row.append(value)
        writer.writerow(first_row)

    @staticmethod
    def check_key_in_csv_header(item_dict, row):
        """
        Check, for each item key, whether it is contained in the first line of the csv.
        k (key) stands for each dictionary key of the scrapy item.
        :param item_dict:
        :param row:
        :return:
        """
        for k in item_dict:
            if k not in row:
                return False
        return True
Related
My project uses SerpAPI to generate a list of sites, scrapes them for any about/contact pages, and then scrapes the emails from those pages.
It had been working completely fine until I decided to pickle the list of urls generated, and then load the list into my spider.
My main.py:
import pickle
import re
from pathlib import Path

from serpapi import GoogleSearch

# Search Google using SerpAPI
search = GoogleSearch({"q": input("What are you searching? "), "location": input("Where is the location? "),
                       "api_key": input("What is your API key? "), "output": "html",
                       "num": "200", "gl": "us"})

# Filter the html response for links
results = search.get_dict()
organic_results = results['organic_results']
links = []
for result in organic_results:
    links.append(str(result['link']))

# Filter links to remove unwanted sites
to_remove = [
    'wikipedia', 'yelp', 'google', 'britannica', 'tripadvisor', 'amazon', 'ebay', 'craigslist', 'apple',
    'microsoft', 'homeadvisor', 'bing', 'businessinsider'
]
links = [i for i in links if not re.search("|".join(to_remove), i)]
links = list(set(links))  # remove duplicates

# Pickle the list and dump it into a txt file
base_path = Path(__file__).parent
file_path = (base_path / "../sites1.txt").resolve()
with open(file_path, 'wb') as fp:
    pickle.dump(links, fp)

# process = CrawlerProcess(get_project_settings())
#
# process.crawl(EmailSpider)
#
# process.start()
Spider:
import pickle
import re
from pathlib import Path

import tldextract
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from ..items import EmailscrapeItem


class EmailSpider(CrawlSpider):
    name = 'email'
    start_urls = []
    allowed_domains = []

    base_path = Path(__file__).parents[2]
    file_path = (base_path / "../sites1.txt").resolve()
    with open(file_path, 'rb') as fp:
        for i in pickle.load(fp):
            start_urls.append(i)
    for url in start_urls:
        extracted_domain = tldextract.extract(url)
        domain = "{}.{}".format(extracted_domain.domain, extracted_domain.suffix)
        allowed_domains.append(domain)

    rules = [
        Rule(LinkExtractor(allow=r'contact/'), callback='parse'),
        Rule(LinkExtractor(allow=r'contact-us/'), callback='parse'),
        Rule(LinkExtractor(allow=r'about'), callback='parse'),
        Rule(LinkExtractor(allow=r'about-us'), callback='parse')
    ]

    def parse(self, response, **kwargs):
        items = EmailscrapeItem()
        regex = re.compile(
            r'([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+'
        )
        # extract emails from mailto: links
        for res in response.xpath("//a[starts-with(@href, 'mailto')]/text()"):
            items['email'] = res.get()
            yield items
        # extract emails using the regex
        html = str(response.text)
        mail_list = re.findall(regex, html)
        for mail in mail_list:
            items['email'] = mail
            yield items
And pipelines:
import re

from scrapy import signals
from scrapy.exporters import CsvItemExporter
from scrapy.exceptions import DropItem


class EmailscrapePipeline(object):
    def __init__(self):
        self.exporter = None
        self.email_list = set()
        self.file = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('emails.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        regex = re.compile(
            r'([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+'
        )
        if not item['email']:
            raise DropItem("Item is None or empty")
        if not re.search(regex, str(item['email'])):
            raise DropItem("Item is not an email.")
        if item['email'] in self.email_list:
            raise DropItem("Duplicate item email found: %s" % item)
        else:
            self.email_list.add(item['email'])
            return item
No errors appear when I run the spider from the command line.
"Most" sites return a DEBUG (200).
If anyone could point me in a good direction, that would be great. I've reduced the timeout to 15 seconds, so I'm not sure why it freezes.
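For reference, by "timeout" I mean Scrapy's download timeout in settings.py (assuming that is the relevant setting here):
# settings.py (assumption: this is the timeout being referred to)
DOWNLOAD_TIMEOUT = 15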
I am trying to download all pdf files from the links in the following URLs:
https://www.adb.org/projects/documents/country/ban/year/2020?terms=education
https://www.adb.org/projects/documents/country/ban/year/2019?terms=education
https://www.adb.org/projects/documents/country/ban/year/2018?terms=education
These URLs have lists of links which direct to sub-links containing PDF files. The lists of links in the main URLs come from the search results for a country, a year and a term.
I have tried the following code, changing it in different ways, but it does not seem to work. Any help would be appreciated. Thanks.
import os
import time
from glob import glob

import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

url = ["https://www.adb.org/projects/documents/country/ban/year/2020?terms=education",
       "https://www.adb.org/projects/documents/country/ban/year/2019?terms=education",
       "https://www.adb.org/projects/documents/country/ban/year/2018?terms=education"]

folder = glob("J:/pdfs/*/")

for i, folder_location in zip(url, folder):
    time.sleep(1)
    response = requests.get(i)
    soup = BeautifulSoup(response.text, "lxml")
    for link in soup.select("[href$='.pdf']"):
        filename = os.path.join(folder_location, link['href'].split('/')[-1])
        with open(filename, 'wb') as f:
            f.write(requests.get(urljoin(i, link['href'])).content)
Try this. It will put the files in the pdfs folder.
import os

from simplified_scrapy import Spider, SimplifiedDoc, SimplifiedMain, utils


class MySpider(Spider):
    name = 'download_pdf'
    allowed_domains = ["www.adb.org"]
    start_urls = [
        "https://www.adb.org/projects/documents/country/ban/year/2020?terms=education",
        "https://www.adb.org/projects/documents/country/ban/year/2019?terms=education",
        "https://www.adb.org/projects/documents/country/ban/year/2018?terms=education"
    ]  # Entry pages

    def __init__(self):
        Spider.__init__(self, self.name)  # necessary
        if not os.path.exists('./pdfs'):
            os.mkdir('./pdfs')

    def afterResponse(self, response, url, error=None, extra=None):
        try:
            path = './pdfs' + url[url.rindex('/'):]
            index = path.find('?')
            if index > 0: path = path[:index]
            flag = utils.saveResponseAsFile(response, path, fileType="pdf")
            if flag:
                return None
            else:  # If it's not a pdf, leave it to the framework
                return Spider.afterResponse(self, response, url, error)
        except Exception as err:
            print(err)

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        lst = doc.selects('div.list >a').contains("documents/", attr="href")
        if not lst:
            lst = doc.selects('div.hidden-md hidden-lg >a')
        urls = []
        for a in lst:
            a["url"] = utils.absoluteUrl(url.url, a["href"])
            urls.append(a)
        return {"Urls": urls}


SimplifiedMain.startThread(MySpider())  # Start the download
With the version below, the PDFs from each URL will be downloaded into a separate folder.
from simplified_scrapy import Spider, SimplifiedDoc, SimplifiedMain, utils


class MySpider(Spider):
    name = 'download_pdf'
    allowed_domains = ["www.adb.org"]
    start_urls = [
        "https://www.adb.org/projects/documents/country/ban/year/2020?terms=education",
        "https://www.adb.org/projects/documents/country/ban/year/2019?terms=education",
        "https://www.adb.org/projects/documents/country/ban/year/2018?terms=education"
    ]  # Entry pages

    def afterResponse(self, response, url, error=None, extra=None):
        if not extra:
            print("The version of library simplified_scrapy is too old, please update.")
            SimplifiedMain.setRunFlag(False)
            return
        try:
            path = './pdfs'
            # create folder start
            srcUrl = extra.get('srcUrl')
            if srcUrl:
                index = srcUrl.find('year/')
                year = ''
                if index > 0:
                    year = srcUrl[index + 5:]
                    index = year.find('?')
                    if index > 0:
                        path = path + year[:index]
                        utils.createDir(path)
            # create folder end

            path = path + url[url.rindex('/'):]
            index = path.find('?')
            if index > 0: path = path[:index]
            flag = utils.saveResponseAsFile(response, path, fileType="pdf")
            if flag:
                return None
            else:  # If it's not a pdf, leave it to the framework
                return Spider.afterResponse(self, response, url, error, extra)
        except Exception as err:
            print(err)

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        lst = doc.selects('div.list >a').contains("documents/", attr="href")
        if not lst:
            lst = doc.selects('div.hidden-md hidden-lg >a')
        urls = []
        for a in lst:
            a["url"] = utils.absoluteUrl(url.url, a["href"])
            # Set root url start
            a["srcUrl"] = url.get('srcUrl')
            if not a['srcUrl']:
                a["srcUrl"] = url.url
            # Set root url end
            urls.append(a)
        return {"Urls": urls}

    # Download again by resetting the URLs. Called when you want to download again.
    def resetUrl(self):
        Spider.clearUrl(self)
        Spider.resetUrlsTest(self)


SimplifiedMain.startThread(MySpider())  # Start the download
class AljazeeraSpider(XMLFeedSpider):
    name = "aljazeera"
    allowed_domains = ["aljazeera.com"]
    start_urls = [
        'http://www.aljazeera.com/',
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)  # The XPath selector
        titles = hxs.select('//div[contains(@class,"SkyScrapperBoxes")]/div[contains(@class,"skyscLines")]')
        if not titles:
            MailNotify().send_mail("Aljazeera", "Scraper Report")
        items = []
        for titles in titles:
            item = NewsItem()
            item['title'] = escape(''.join(titles.select('a/text()').extract()))
            item['link'] = "http://www.aljazeera.com" + escape(''.join(titles.select('a/@href').extract()))
            item['description'] = ''
            item = Request(item['link'], meta={'item': item}, callback=self.parse_detail)
            items.append(item)
        return items

    def parse_detail(self, response):
        item = response.meta['item']
        sel = HtmlXPathSelector(response)
        detail = sel.select('//td[@class = "DetailedSummary"]')
        item['details'] = remove_html_tags(escape(''.join(detail.select('p').extract())))
        item['location'] = ''
        published_date = sel.select('//span[@id = "ctl00_cphBody_lblDate"]')
        item['published_date'] = escape(''.join(published_date.select('text()').extract()))
        return item
I am currently working with Scrapy to crawl this website. I have some knowledge of unittest in Python, but how can I write unit tests that check whether the link works and whether item['location'] and item['details'] return values? I have looked at Scrapy contracts but cannot understand them. So, how can I write unit tests in this case?
If we are talking specifically about how to test the spiders (not pipelines or loaders), then what we did was provide a "fake response" from a local HTML file. Sample code:
import os

from scrapy.http import Request, TextResponse


def fake_response(file_name=None, url=None):
    """Create a fake Scrapy HTTP response from an HTML file"""
    if not url:
        url = 'http://www.example.com'
    request = Request(url=url)
    if file_name:
        if not file_name[0] == '/':
            responses_dir = os.path.dirname(os.path.realpath(__file__))
            file_path = os.path.join(responses_dir, file_name)
        else:
            file_path = file_name
        file_content = open(file_path, 'r').read()
    else:
        file_content = ''
    response = TextResponse(url=url, request=request, body=file_content,
                            encoding='utf-8')
    return response
Then, in your TestCase class, call the fake_response() function and feed the response to the parse() callback:
from unittest.case import TestCase


class MyTestCase(TestCase):
    def setUp(self):
        self.spider = MySpider()

    def test_parse(self):
        response = fake_response('input.html')
        item = self.spider.parse(response)
        self.assertEqual(item['title'], 'My Title')
        # ...
Aside from that, you should definitely start using Item Loaders with input and output processors. This gives you better modularity and, hence, isolation: the spider just yields item instances, while data preparation and modification are encapsulated inside the loader, which you can test separately.
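As a rough illustration of that approach (only a sketch; the loader class name, field names and processors here are assumptions for illustration, not code from the question):
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, MapCompose, TakeFirst


class NewsItemLoader(ItemLoader):
    # strip whitespace on input, take the first value on output
    default_input_processor = MapCompose(lambda v: v.strip())
    default_output_processor = TakeFirst()
    # a multi-valued field can get its own output processor
    details_out = Join(' ')


# in the spider callback:
# loader = NewsItemLoader(item=NewsItem(), response=response)
# loader.add_xpath('title', '//h1/text()')
# loader.add_value('link', response.url)
# yield loader.load_item()
The processors can then be unit-tested on plain strings without running the spider at all.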
I am trying to use Scrapy to export scraped items into a CSV file with each field enclosed in double quotes. Currently, the CSV exports correctly, but when I try to modify the item fields and add double quotes manually, the CSV ends up with each field enclosed in triple double quotes. Here is an example of what I'm trying to do:
Scrapy code
import scrapy
from tutorial.items import StoreItem


class SecilSpider(scrapy.Spider):
    name = "secil"
    allowed_domains = ["secilstore.com"]

    def start_requests(self):
        start_urls = reversed(["http://www.secilstore.com/yeni_liste/Sayfa/{0}".format(page) for page in xrange(1, 2)] +
                              ["http://www.secilstore.com/yeni_liste/Magaza/Aksesuar_32/Sayfa/{0}".format(page) for page in xrange(1, 2)] +
                              ["http://www.secilstore.com/yeni_liste/Magaza/%C3%87anta_33/Sayfa/{0}".format(page) for page in xrange(1, 2)])
        return [scrapy.Request(url=start_url) for start_url in start_urls]

    def parse(self, response):
        item = StoreItem()
        for url in response.xpath('//div[@class="image"]/a/@href').extract():
            yield scrapy.Request("http://www.secilstore.com" + url, callback=self.parse)
        baseUrl = response.request.headers.get('Referer', None)
        if baseUrl is not None:
            baseUrl = baseUrl.split('Sayfa')[0]
        color = response.xpath('//a[@class="renk"]/text()').extract()
        for c in color:
            item['url'] = baseUrl
            item['productUrl'] = response.url
            item['imageUrl'] = "http://www.secilstore.com" + response.xpath('//img[@id="productMainImage"]/@src').extract()[0]
            item['color'] = c
            item['price'] = response.xpath('//span[@class="price cufonHover"]/text()').extract()[0] + "TL"
            item['title'] = response.xpath('//h2[@class="cufon"]/text()').extract()
            item['brand'] = response.xpath('//h3[@class="slogan cufonSemi"]/text()').extract()[0]
            size = '|'.join(s.strip() for s in response.xpath('//a[@class="inStock"]/text()').extract())
            item['size'] = size if size else -1
            oldPrice = response.xpath('//div[@class="indirimFiyat"]/text()').extract()
            item['oldPrice'] = oldPrice[0] + "TL" if oldPrice else -1
            yield item
My CSV Item Pipeline
class CSVPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('/home/ali/%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file, False, '"')
        self.exporter.fields_to_export = ['url', 'productUrl', 'title', 'brand', 'imageUrl', 'price', 'oldPrice', 'color', 'size']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
So when I try to modify a field in the spider and add double quotes manually like this (for example, for item['url']):
item['url'] = '"%s"' % baseUrl
the resulting CSV prints out the following:
"""http://www.secilstore.com/yeni_liste/Magaza/%C3%87anta_33""",http://www.secilstore.com/urun/5905b5c6b858458df3f4851d477eec1b/Secil-Kilit-Aksesuarli-Kisa-Sapli-Canta,Kilit Aksesuarlı Kısa Saplı Çanta,Seçil,http://www.secilstore.com/_docs/i400x500/a/a1894cadeb_Kilit-Aksesuarli-Kisa-Sapli-canta.jpg,"69,90TL","159,90TL",Ekru,-1
As you can see, the first field is surrounded by triple double quotes instead of only one pair. Also, interestingly, the prices are already printed in double quotes. How can I surround each field with only one pair of double quotes?
Thanks!
I fixed it by modifying the CSV item pipeline:
self.exporter = CsvItemExporter(open(spider.name + ".csv", "w"), False,
                                fields_to_export=self.fields_to_export, quoting=csv.QUOTE_ALL)
This allowed me to generate a CSV file with the fields in double quotes.
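Put in context, the spider_opened method of the pipeline above might look roughly like the sketch below (assuming csv is imported in the pipeline module and a Scrapy version where CsvItemExporter lives in scrapy.exporters; older versions import it from scrapy.contrib.exporter):
import csv
from scrapy.exporters import CsvItemExporter


class CSVPipeline(object):
    # ... __init__, from_crawler, spider_closed and process_item as above ...

    def spider_opened(self, spider):
        file = open('/home/ali/%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file
        # QUOTE_ALL makes the exporter wrap every field in double quotes itself
        self.exporter = CsvItemExporter(file, False, quoting=csv.QUOTE_ALL)
        self.exporter.fields_to_export = ['url', 'productUrl', 'title', 'brand',
                                          'imageUrl', 'price', 'oldPrice', 'color', 'size']
        self.exporter.start_exporting()
With quoting=csv.QUOTE_ALL the exporter adds the quotes itself, so there is no need to wrap values in quotes in the spider; that manual wrapping is what produced the triple quotes.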
I can't figure out how to make Scrapy crawl links in order.
I've got a page with articles, and each one has a title, but the article doesn't match the title.
Also in settings.py I added:
DEPTH_PRIORITY = 1
SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleFifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.FifoMemoryQueue'
I've got something like this:
class Getgot(Spider):
    name = "getem"
    allowed_domains = ["somesite.us"]
    start_urls = ["file:local.html"]
    el = '//div[@article]'

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        s = hxs.select('//article')
        filename = ("links.txt")
        filly = open(filename, "w")
        for i in s:
            t = i.select('a/@href').extract()
            filly.write(str(t[0]) + '\n')
            yield Request(str(t[0]), callback=self.parse_page)

    def parse_page(self, res):
        hxs = HtmlXPathSelector(res)
        s = hxs.select('//iframe').extract()
        if s:
            filename = ("frames.txt")
            filly = open(filename, "a")
            filly.write(str(s[0]) + '\n')
        else:
            filename = ("/frames.txt")
            filly = open(filename, "a")
            filly.write('[]\n')
I'm not sure I understand how your question and your code are related. Where is the title?
A few tips: 1) update your Scrapy syntax to the latest version; 2) don't write any files from the spider itself, write them in a pipeline or use a feed export; 3) if you need to transfer data from one callback to the next, use the meta attribute.
def parse(self, response):
    for link in response.xpath("//article/a/@href").extract():
        yield Request(link, callback=self.parse_page, meta={'link': link})

def parse_page(self, response):
    for frame in response.xpath("//iframe").extract():
        item = MyItem()
        item['link'] = response.meta['link']
        item['frame'] = frame
        yield item
And then you export it to csv or json or whatever, to store the link and the frame together.
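If you go the feed export route instead of a custom pipeline, a minimal sketch of the settings would be something like this (assuming a recent Scrapy that supports the FEEDS setting; older versions use FEED_URI and FEED_FORMAT instead):
# settings.py
FEEDS = {
    'items.csv': {'format': 'csv'},
}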