scrapy HTML Response content isn't text - python

This is my code that scans a user and outputs their SteamID and the value of their inventory:
import scrapy
import logging

bot_words = [
    "bot",
    "BOT",
    "Bot",
    "[tf2mart]"
]

class AccountSpider(scrapy.Spider):
    name = "accounts"
    start_urls = [
        'file:///Users/max/Documents/promotebot/tutorial/tutorial/TF2ITEMS.htm'
    ]

    def linkgen(self):
        global steamid
        print("Downloading Page...")
        yield scrapy.Request("http://www.backpack.tf" + steamid, callback=self.parse_accounts)
        print("Page successfully downloaded.")

    def parse(self, response):
        global steamid
        lgen = self.linkgen()
        for tr in response.css("tbody"):
            for user in response.css("span a"):
                if bot_words not in response.css("span a"):
                    print("Parsed info")
                    print("User: " + user.extract())
                    steamid = user.css('::attr(href)').extract()[0]
                    print("Steam ID: " + steamid)
                    yield lgen.next()

    def parse_accounts(self, response):
        print("Value finding function activted.")
        #print(response.headers, response.body)
        print(response.css("head"))
        for description in response.css("head"):
            print("level 1 value")
            value = response.css("description.content").extract()
            print(value)
Expected output is:
Parsed info
User: user
Steam ID: /profiles/76561198017108***
(SOME VALUE)
Current output is:
2018-06-15 15:08:12 [scrapy.core.engine] DEBUG: Crawled (200) <GET file:///Users/max/Documents/promotebot/tutorial/tutorial/TF2ITEMS.htm> (referer: None)
Parsed info
User: cakedog
Steam ID: /profiles/76561198017108466
Downloading Page...
2018-06-15 15:08:12 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://www.backpack.tf/robots.txt> from <GET http://www.backpack.tf/robots.txt>
2018-06-15 15:08:12 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.backpack.tf/robots.txt> (referer: None) ['cached']
2018-06-15 15:08:12 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://www.backpack.tf/profiles/76561198017108466> from <GET http://www.backpack.tf/profiles/76561198017108466>
2018-06-15 15:08:12 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.backpack.tf/profiles/76561198017108466> (referer: None) ['cached']
Parsed info
User: Jarvis Frapner
Steam ID: /profiles/76561198015589573
Page successfully downloaded.
Value finding function activted.
2018-06-15 15:08:12 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.backpack.tf/profiles/76561198017108466> (referer: None)
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/twisted/internet/defer.py", line 653, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "/Users/max/Documents/promotebot/tutorial/tutorial/spiders/accounts_spider.py", line 38, in parse_accounts
    print(response.css("head"))
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/http/response/__init__.py", line 99, in css
    raise NotSupported("Response content isn't text")
NotSupported: Response content isn't text
Despite the multithreading (the linkgen generator downloading the request while the parse function is activating it again), the function should still work(?). I can't seem to convert the HTTP response into a text object.
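For reference, a minimal sketch (not the poster's code, and not a diagnosis of the NotSupported error) of how this flow is usually written in Scrapy without the module-level global and the hand-driven generator; the backpack.tf base URL and the parse_accounts callback name are kept from the question:

import scrapy

class AccountSpider(scrapy.Spider):
    name = "accounts"
    start_urls = [
        'file:///Users/max/Documents/promotebot/tutorial/tutorial/TF2ITEMS.htm'
    ]

    def parse(self, response):
        # Yield one Request per profile link; Scrapy schedules the download
        # and calls parse_accounts with the profile page when it arrives.
        for user in response.css("span a"):
            steamid = user.css('::attr(href)').extract_first()
            if steamid:
                yield scrapy.Request("https://www.backpack.tf" + steamid,
                                     callback=self.parse_accounts)

    def parse_accounts(self, response):
        # response.text / response.css() are only available when the response
        # body is text (HTML); that is what the NotSupported error complains about.
        print(response.css("head"))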

Related

Scrapy, could not get the next page

I'm working on Google search results crawling through this link.
https://www.google.com/search?q=telsa+"model3"+intext:model3&hl=en&rlz
I have disabled Javascript in the Chrome Browser and obtained the xpath value for Next.
xpath -> //*[@id="main"]/footer/div[1]/div/div/a
Here is my code
import scrapy
from ..items import GooglescrapyItem
from urllib.parse import urlparse, parse_qs

class GoogleBotsSpider(scrapy.Spider):
    name = 'GoogleScrapyBot'
    allowed_domains = ['google.com']
    start_urls = [f'https://www.google.com/search?q=telsa+"model3"+intext:model3&hl=en&rlz']

    def parse(self, response):
        titles = response.xpath('//*[@id="main"]/div/div/div/a/h3/div//text()').extract()
        links = response.xpath('//*[@id="main"]/div/div/div/a/@href').extract()
        next_page = response.xpath('//*[@id="main"]/footer/div/div/div/a/@href').extract()
        items = []
        for idx in range(len(titles)):
            item = GooglescrapyItem()
            item['title'] = titles[idx]
            parsed_url = urlparse(links[idx])
            query_params = parse_qs(parsed_url.query)
            item['link'] = query_params["q"][0]
            items.append(item)
        if next_page:
            next_href = next_page[0]
            next_page_url = 'https://www.google.com/search?q=telsa+"model3"+intext:model3&hl=en&rlz' + next_href
            request = scrapy.Request(url=next_page_url)
            yield request
        return items
output
DEBUG: Crawled (200) <GET https://www.google.com/search?q=telsa+%22model3%22+intext:model3&hl=en&rlz> (referer: None)
DEBUG: Crawled (200) <GET https://www.google.com/search?q=telsa+%22model3%22+intext:model3&hl=en&rlz/search?q=telsa+%22model3%22+intext:model3&hl=en&ie=UTF-8&ei=LMrhYP3IOY6v0PEPmKGNoAg&start=10&sa=N> (referer: https://www.google.com/search?q=telsa+%22model3%22+intext:model3&hl=en&rlz)
DEBUG: Crawled (200) <GET https://www.google.com/search?q=telsa+%22model3%22+intext:model3&hl=en&rlz/search?q=telsa+%22model3%22+intext:model3&hl=en&ie=UTF-8&ei=LsrhYIf-AdSTr7wPtt-LyA4&start=0&sa=N> (referer: https://www.google.com/search?q=telsa+%22model3%22+intext:model3&hl=en&rlz/search?q=telsa+%22model3%22+intext:model3&hl=en&ie=UTF-8&ei=LMrhYP3IOY6v0PEPmKGNoAg&start=10&sa=N)
DEBUG: Crawled (200) <GET https://www.google.com/search?q=telsa+%22model3%22+intext:model3&hl=en&rlz/search?q=telsa+%22model3%22+intext:model3&hl=en&ie=UTF-8&ei=L8rhYJCNCI7_0gSA5qKAAg&start=10&sa=N> (referer: https://www.google.com/search?q=telsa+%22model3%22+intext:model3&hl=en&rlz/search?q=telsa+%22model3%22+intext:model3&hl=en&ie=UTF-8&ei=LsrhYIf-AdSTr7wPtt-LyA4&start=0&sa=N)
DEBUG: Crawled (200) <GET https://www.google.com/search?q=telsa+%22model3%22+intext:model3&hl=en&rlz/search?q=telsa+%22model3%22+intext:model3&hl=en&ie=UTF-8&ei=MMrhYOLRHeLFmAX2w4ioBA&start=0&sa=N> (referer: https://www.google.com/search?q=telsa+%22model3%22+intext:model3&hl=en&rlz/search?q=telsa+%22model3%22+intext:model3&hl=en&ie=UTF-8&ei=L8rhYJCNCI7_0gSA5qKAAg&start=10&sa=N)
...Skip
Question
Google search result pages advance in steps of 10, such as &start=0, &start=10, &start=20, &start=30.
However, my code keeps alternating between &start=0 and &start=10 and never moves on to &start=20.
Could you please go over my code?
Try nextPage = response.xpath('//td[@role="heading"]/a/@href').get()
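A minimal sketch of how that selector could slot into the spider above; the selector itself comes from the suggestion, while wiring it through response.follow and reusing parse as the callback are assumptions:

import scrapy

class GoogleBotsSpider(scrapy.Spider):
    name = 'GoogleScrapyBot'
    allowed_domains = ['google.com']
    start_urls = ['https://www.google.com/search?q=telsa+"model3"+intext:model3&hl=en']

    def parse(self, response):
        # ... build and yield GooglescrapyItem objects here, as in the question ...

        # Follow the "Next" link using the suggested selector. response.follow
        # resolves the relative href against response.url, which avoids the
        # doubled-up URLs visible in the debug log above.
        next_href = response.xpath('//td[@role="heading"]/a/@href').get()
        if next_href:
            yield response.follow(next_href, callback=self.parse)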

Scrapy follows link but does not return data, possible timing issue?

I have tried several settings, such as delaying the download time; the console does not show an error, and the selectors return the correct data from the Scrapy shell.
The site uses a different prefix on the domain (slist.amiami.jp); could this be the cause?
I tried several variations of domains and URLs, but all of them result in the same response of no data returned.
Any idea why it is not collecting any data for the -o CSV file? Thank you for any advice.
The expected output is the JAN code and category text from the product page.
2021-05-13 23:59:35 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2021-05-13 23:59:35 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6026
2021-05-13 23:59:40 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://example.jp/top/search/list?s_keywords=4967834601246> (referer: None)
2021-05-13 23:59:46 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.example.jp/top/detail/detail?gcode=TOY-SCL-05454> (referer: https://example.jp/top/search/list?s_keywords=4967834601246)
2021-05-13 23:59:50 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://example.jp/top/search/list?s_keywords=4543736302216> (referer: None)
2021-05-14 00:00:04 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://example.jp/top/search/list?s_keywords=44536318620013> (referer: None)
2021-05-14 00:00:04 [scrapy.core.engine] INFO: Closing spider (finished)
2021-05-14 00:00:04 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 1115,
'downloader/request_count': 4,
'downloader/request_method_count/GET': 4,
'elapsed_time_seconds': 29.128242,
'finish_reason': 'finished',
import scrapy

class exampledataSpider(scrapy.Spider):
    name = 'example'
    allowed_domains = ['example.jp']
    start_urls = ['https://example.jp/top/search/list?s_keywords=4967834601246',
                  'https://example.jp/top/search/list?s_keywords=4543736302216',
                  'https://example.jp/top/search/list?s_keywords=44536318620013',
                  ]

    def parse(self, response):
        for link in response.css('div.product_box a::attr(href)'):
            yield response.follow(link.get(), callback=self.item)

    def item(self, response):
        products = response.css('div.maincontents')
        for product in products:
            yield {
                'JAN': product.css('dd.jancode::text').getall(),
                'title': product.css('div.pankuzu a::text').getall()
            }
It seems the products = response.css('div.maincontents') selector was incorrect, and I had to make two separate parent-child requests for the data.
It also turns out you can simply yield the elements in a list:
def output(self, response):
    yield {
        'firstitem': response.css('example td:nth-of-type(2)::text').getall(),
        'seconditem': response.css('example td:nth-of-type(2)::text').getall(),
        'thrditem': response.css('example td:nth-of-type(2)::text').getall()
    }
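The two separate parent-child requests mentioned above aren't shown; here is a plausible sketch using cb_kwargs to carry listing-page data into the detail-page callback (the selectors and field names are assumptions, not the poster's actual code):

import scrapy

class ExampleDataSpider(scrapy.Spider):
    name = 'example_parent_child'
    allowed_domains = ['example.jp']
    start_urls = ['https://example.jp/top/search/list?s_keywords=4967834601246']

    def parse(self, response):
        # Parent request: take data from the listing page and pass it to the
        # detail-page callback via cb_kwargs.
        for box in response.css('div.product_box'):
            link = box.css('a::attr(href)').get()
            title = box.css('a::text').get()
            if link:
                yield response.follow(link, callback=self.parse_detail,
                                      cb_kwargs={'title': title})

    def parse_detail(self, response, title):
        # Child request: combine listing-page data with detail-page data.
        yield {
            'title': title,
            'JAN': response.css('dd.jancode::text').getall(),
        }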

Scrapy LinkExtractor fails to find existing url

I have a Crawler like this:
class SkySpider(CrawlSpider):
    name = "spider_v1"
    allowed_domains = [
        "atsu.edu",
    ]
    start_urls = [
        "http://www.atsu.edu",
    ]
    rules = (
        Rule(
            INFO_LINKS_EXTRACTOR,
            follow=True,
            callback='parse_item',
        ),
    )

    def parse_item(self, response):
        print("ENTERED!")
        item = SportsScraperItem()
        item["contact"] = self._parse_contact(response)
        return item
In my helpers.py I have:
from scrapy.linkextractors import LinkExtractor

def _r(string):
    return f"(.*?)(\b{string}\b)(.*)"

INFO_LINKS_EXTRACTOR = LinkExtractor(
    allow=(
        _r('about'),
    ),
    unique=True,
)
I know that atsu.edu has a link https://www.atsu.edu/about-atsu/, but my extractor does not seem to see it, and the parse_item() method is not run. What am I doing wrong here?
EDIT 1:
Logs:
2019-10-01 15:40:58 [scrapy.core.engine] INFO: Spider opened
2019-10-01 15:40:58 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2019-10-01 15:40:58 [steppersspider_v1] INFO: Spider opened: steppersspider_v1
2019-10-01 15:40:58 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2019-10-01 15:40:59 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://www.atsu.edu/robots.txt> from <GET http://WWW.ATSU.EDU/robots.txt>
2019-10-01 15:41:05 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.atsu.edu/robots.txt> (referer: None)
2019-10-01 15:41:11 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://www.atsu.edu/> from <GET http://WWW.ATSU.EDU>
2019-10-01 15:41:15 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.atsu.edu/robots.txt> (referer: None)
2019-10-01 15:41:19 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.atsu.edu/> (referer: None)
2019-10-01 15:41:19 [steppersspider_v1] DEBUG: Saved file steppers-www.atsu.edu.html
2019-10-01 15:41:20 [scrapy.core.engine] INFO: Closing spider (finished)
2019-10-01 15:41:20 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
EDIT 2
Here is how I tested this regexp on regexp101.com.
EDIT 3
Working function for regexp:
def _r(string):
    return r"^(.*?)(\b{string}\b)(.*)$".format(string=string)
By default, link extractors only search for a and area tags. The links you are looking for seem to be in li tags.
You need to pass the tags parameter to the constructor of your link extractor with the desired tags. For example:
tags=('a', 'area', 'li')
See https://doc.scrapy.org/en/latest/topics/link-extractors.html#module-scrapy.linkextractors.lxmlhtml
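A short sketch of that suggestion applied to the extractor from helpers.py; the tags value comes from the answer, while spelling out attrs=('href',) (the default) is just for clarity:

from scrapy.linkextractors import LinkExtractor

def _r(string):
    # Raw string so \b stays a regex word boundary (see EDIT 3 above).
    return r"^(.*?)(\b{string}\b)(.*)$".format(string=string)

INFO_LINKS_EXTRACTOR = LinkExtractor(
    allow=(_r('about'),),
    tags=('a', 'area', 'li'),  # also scan <li> tags, as suggested
    attrs=('href',),           # attribute to read the link value from
    unique=True,
)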

Callback functions are not fired

I am trying to scrape MichaelKors.com. I had success until now; my script just stopped working. Callback functions are not being fired. I have removed everything from my functions, and even then they are not being called. Here is my code:
class MichaelKorsClass(CrawlSpider):
    name = 'michaelkors'
    allowed_domains = ['www.michaelkors.com']
    start_urls = ['https://www.michaelkors.com/women/clothing/dresses/_/N-28ei']
    rules = (
        # Rule(LinkExtractor(allow=('(.*\/_\/R-\w\w_)([\-a-zA-Z0-9]*)$', ), deny=('((.*investors.*)|(/info/)|(contact\-us)|(checkout))', )), callback='parse_product'),
        Rule(LinkExtractor(allow=('(.*\/_\/)(N-[\-a-zA-Z0-9]*)$',),
                           deny=('((.*investors.*)|(/info/)|(contact\-us)|(checkout) | (gifts))',),), callback='parse_list'),
    )

    def parse_product(self, response):
        self.log("HIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII")

    def parse_list(self, response):
        hxs = HtmlXPathSelector(response)
        url = response.url
        self.log("Helloww")
        is_listing_page = False
        product_count = hxs.select('//span[@class="product-count"]/text()').get()
        #print(re.findall('\d+', pc))
        try:
            product_count = int(product_count)
            is_listing_page = True
        except:
            is_listing_page = False
        if is_listing_page:
            for product_url in response.xpath('//ul[@class="product-wrapper product-wrapper-four-tile"]//li[@class="product-name-container"]/a/@href').getall():
                yield scrapy.Request(response.urljoin(product_url), callback=self.parse_product)
And here is the log:
2019-07-29 11:25:50 [scrapy.core.engine] INFO: Spider opened
2019-07-29 11:25:50 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2019-07-29 11:25:50 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2019-07-29 11:25:52 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.michaelkors.com/women/clothing/dresses/_/N-28ei> (referer: None)
2019-07-29 11:25:54 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.michaelkors.com/sale/view-all-sale/_/N-28zn> (referer: https://www.michaelkors.com/women/clothing/dresses/_/N-28ei)
2019-07-29 11:25:55 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.michaelkors.com/women/clothing/jumpsuits/_/N-18bkjwa> (referer: https://www.michaelkors.com/women/clothing/dresses/_/N-28ei)
2019-07-29 11:25:59 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.michaelkors.com/women/clothing/t-shirts-sweatshirts/_/N-10dkew5> (referer: https://www.michaelkors.com/women/clothing/dresses/_/N-28ei)
....
Neither "Helloww" nor the "HIIII..." message is printed.
Edit 1: I copied my script to another project and it works fine. I still don't know what the problem was.

Scrapy - POST request is called on referred url not initial one

I am submitting a FormRequest to change the page number across multiple pages of results.
When I use the Scrapy shell, the POST request goes through:
2017-05-21 22:44:19 [scrapy.core.engine] INFO: Spider opened
2017-05-21 22:44:20 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://www.australianschoolsdirectory.com.au/robots.txt> (referer: None)
2017-05-21 22:44:22 [scrapy.core.engine] DEBUG: Crawled (200) <POST http://www.australianschoolsdirectory.com.au/search-result.php> (referer: None)
True
2017-05-21 22:44:27 [scrapy.core.engine] DEBUG: Crawled (200) <POST http://www.australianschoolsdirectory.com.au/search-result.php> (referer: None)
True
2017-05-21 22:44:39 [scrapy.core.engine] DEBUG: Crawled (200) <POST http://www.australianschoolsdirectory.com.au/search-result.php> (referer: None)
True
2017-05-21 22:44:43 [scrapy.core.engine] DEBUG: Crawled (200) <POST http://www.australianschoolsdirectory.com.au/search-result.php> (referer: None)
True
2017-05-21 22:44:46 [scrapy.core.engine] DEBUG: Crawled (200) <POST http://www.australianschoolsdirectory.com.au/search-result.php> (referer: None)
True
Using this request sequence:
>>> from scrapy.http import FormRequest
>>> url = 'http://www.australianschoolsdirectory.com.au/search-result.php'
>>> for i in range(1, 6):
...     payload = {'pageNum': str(i)}
...     r = FormRequest(url, formdata=payload)
...     fetch(r)
...     view(response)
But when I implement the POST request in my Scrapy code, the POST is referred back to the initial search site.
2017-05-21 22:58:42 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://www.australianschoolsdirectory.com.au/robots.txt> (referer: None)
2017-05-21 22:58:45 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://www.australianschoolsdirectory.com.au/search-result.php> (referer: None)
2017-05-21 22:58:46 [scrapy.core.engine] DEBUG: Crawled (200) <POST http://www.australianschoolsdirectory.com.au/search.php> (referer: http://www.australianschoolsdirectory.com.au/search-result.php)
Of course, search.php doesn't have the data I'm looking for. Why does the POST in my code get directed back to search.php, but not in the shell? And how can I stop that referral while still moving on to the next set of results?
Scrapy code:
from scrapy.http import FormRequest
from scrapy.spiders import Spider

class Foo(Spider):
    name = "schoolsTest"
    allowed_domains = ["australianschoolsdirectory.com.au"]
    start_urls = ["http://www.australianschoolsdirectory.com.au/search-result.php"]

    def parse(self, response):
        yield FormRequest.from_response(response, formdata={'pageNum': str(5), 'search': 'true'}, callback=self.parse1)

    def parse1(self, response):
        print response.url
First of all, you don't need to use from_response (since you are not dealing with a form), and you can use Scrapy's start_requests method:
import scrapy

class Foo(scrapy.Spider):
    name = "schoolsTest"

    def start_requests(self):
        url = "http://www.australianschoolsdirectory.com.au/search-result.php"
        # Change 5 to 488 to parse all search result
        for i in range(1, 5):
            payload = {'pageNum': str(i)}
            yield scrapy.FormRequest(url, formdata=payload)

    def parse(self, response):
        # Extract all links from search page and make absolute urls
        links = response.xpath('//div[@class="listing-header"]/a/@href').extract()
        for link in links:
            full_url = response.urljoin(link)
            # Make a Request to each detail page
            yield scrapy.Request(full_url, callback=self.parse_detail)

    def parse_detail(self, response):
        print(response.url)
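One note on the design (spelled out here as an assumption, not stated in the original answer): because no callback is passed to scrapy.FormRequest in start_requests, each POST response is handled by the spider's default parse method, which is what then schedules the detail-page requests.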
