Scraping each individual movie page on IMDb using Scrapy - Python

I have a CSV file which contains the IMDb movie IDs of 300 movies. The IMDb URL for each movie has the format: https://www.imdb.com/title/ttmovieID
I want to scrape each movie's dedicated page for the thumbnail image link, title, actors, and year of release, and write the results to a CSV file where each row contains the data for one movie.
Since I have the movie ID for each movie in a CSV file, what should the start_urls of my spider be, and how should my parse function be structured? Also, how do I write the output to a CSV file?
I have the following approach for IMDb's Top 250 page. What changes should I make to start_urls and the links?
import scrapy
import csv
from example.items import MovieItem

class ImdbSpider(scrapy.Spider):
    name = "imdbtestspider"
    allowed_domains = ["imdb.com"]
    start_urls = ['http://www.imdb.com/chart/top',]

    def parse(self, response):
        links = response.xpath('//tbody[@class="lister-list"]/tr/td[@class="titleColumn"]/a/@href').extract()
        i = 1
        for link in links:
            abs_url = response.urljoin(link)
            url_next = '//*[@id="main"]/div/span/div/div/div[2]/table/tbody/tr[' + str(i) + ']/td[3]/strong/text()'
            rating = response.xpath(url_next).extract()
            if(i <= len(links)):
                i = i + 1
            yield scrapy.Request(abs_url, callback=self.parse_indetail, meta={'rating': rating})

    def parse_indetail(self, response):
        item = MovieItem()
        item['title'] = response.xpath('//div[@class="title_wrapper"]/h1/text()').extract()[0][:-1]
        item['director'] = response.xpath('//div[@class="credit_summary_item"]/span[@itemprop="director"]/a/span/text()').extract()
        return item

You could just read your .csv file in the start_requests method and yield requests from there. The code could be something like:
import csv
from scrapy import Request
...

    def start_requests(self):
        with open('imdb_ids.csv') as csv_file:
            reader = csv.reader(csv_file, delimiter=',')
            line = 0
            for row in reader:
                if line > 0:  # skip the header row
                    movie_id = row[0]  # csv.reader yields each row as a list
                    yield Request('https://www.imdb.com/title/tt' + movie_id)
                line += 1
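Since the question also asks how to get the scraped fields into a CSV, here is a minimal sketch of the default parse callback that the requests above would land in. The XPath selectors and field names are assumptions about IMDb's title-page markup (which changes over time), so verify them against the live page; the CSV itself comes from Scrapy's feed exporter, e.g. scrapy crawl imdbtestspider -o movies.csv.
    def parse(self, response):
        # One dict per movie page; the CSV feed exporter writes one row per yielded item.
        # The selectors below are illustrative assumptions, not guaranteed IMDb markup.
        yield {
            'thumbnail': response.xpath('//div[@class="poster"]//img/@src').get(),
            'title': response.xpath('//div[@class="title_wrapper"]/h1/text()').get(default='').strip(),
            'year': response.xpath('//span[@id="titleYear"]/a/text()').get(),
            'actors': response.xpath('//table[@class="cast_list"]//td[not(@class)]/a/text()').getall(),
        }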

Related

Python Scrapy grabs all the rows into one single CSV row

I am trying to generate a CSV file with Scrapy; it works, but not as expected. I have an HTML table with multiple rows, and I want the same rows in the CSV. However, the following code collapses all the HTML rows into a single CSV row.
Code:
class DemoSpider(scrapy.Spider):
    name = "DemoSpider"

    def start_requests(self):
        urls = []
        for page in range(1, 2):
            url = "https://directory.easternuc.com/publicDirectory?page=%s" % page
            urls.append(url)
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        item = TutorialItem()
        item['name'] = response.selector.xpath("//tr/td/h4/text()").getall()
        item['phone'] = response.selector.xpath("//tr/td[2]/text()").getall()
        item['mobile'] = response.selector.xpath("//tr/td[3]/text()").getall()
        item['email'] = response.selector.xpath("//tr/td[4]/text()").getall()
        yield item
If I change the getall() method to get(), I get only the first row from the website in the CSV.
Note: as a workaround, I can find the total number of rows on the website and then iterate over them. However, it seems like this worked in older versions of Scrapy.
You will have to iterate over each tr one by one and yield each record separately:
    def parse(self, response):
        for TR in response.xpath("//table/tr"):
            item = TutorialItem()
            item['name'] = TR.xpath("./td/h4/text()").get()
            item['phone'] = TR.xpath("./td[2]/text()").get()
            item['mobile'] = TR.xpath("./td[3]/text()").get()
            item['email'] = TR.xpath("./td[4]/text()").get()
            yield item
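For completeness, the TutorialItem referenced above is just a standard Scrapy item. A minimal sketch, with the field names taken from the question and the module layout assumed, would be:
import scrapy

class TutorialItem(scrapy.Item):
    # one field per CSV column; each yielded item becomes one CSV row
    name = scrapy.Field()
    phone = scrapy.Field()
    mobile = scrapy.Field()
    email = scrapy.Field()
Running the spider with scrapy crawl DemoSpider -o output.csv then writes one CSV row per yielded item.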

My spider is giving me all the results in one-liners in the CSV file

In the first place, if I use extract_first, Scrapy gives me only the first element of each page; if I run it as written below, it returns all the content I want, but in one-liners.
In the second place, I can't make Scrapy follow the links I just scraped and get information from inside those links; it returns an empty CSV file.
from scrapy import Spider
from companies.items import CompaniesItem
import re

class companiesSpider(Spider):
    name = "companies"
    allowed_domains = ['http://startup.miami',]
    # Defining the list of pages to scrape
    start_urls = ["http://startup.miami/category/startups/page/" + str(1*i) + "/" for i in range(0, 10)]

    def parse(self, response):
        rows = response.xpath('//*[@id="datafetch"]')
        for row in rows:
            link = row.xpath('.//h2/a/@href').extract()
            name = row.xpath('.//header/h2/a/text()').extract()
            item = CompaniesItem()
            item['link'] = link
            item['name'] = name
            yield item
Your parse method is not yielding any requests or items. In the code below we go through the pages and collect the URLs and names; in parse_detail you can add additional data to the item.
Instead of hardcoding 10 pages, we check whether there is a next page and run parse again if there is.
from scrapy import Spider
from ..items import CompaniesItem
import scrapy

class CompaniesSpider(Spider):
    name = "companies"
    allowed_domains = ['startup.miami']
    # Defining the list of pages to scrape
    start_urls = ["http://startup.miami/category/startups/"]

    def parse(self, response):
        # get link & name and send item to parse_detail in meta
        rows = response.xpath('//*[@id="datafetch"]/article')
        for row in rows:
            link = row.xpath('.//@href').extract_first()
            name = row.xpath('.//*[@class="textoCoworking"]/text()').extract_first()
            item = CompaniesItem()
            item['link'] = link
            item['name'] = name.strip()
            yield scrapy.Request(link,
                                 callback=self.parse_detail,
                                 meta={'item': item})
        # get the next page
        next_page = response.xpath('//*[@class="next page-numbers"]/@href').extract_first()
        if next_page:
            yield scrapy.Request(next_page, callback=self.parse)

    def parse_detail(self, response):
        item = response.meta['item']
        # add other details to the item here
        yield item
To put the results in a csv file you can launch the scraper like this: scrapy crawl companies -o test_companies.csv
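As a side note, on Scrapy 1.7 or newer, cb_kwargs is a cleaner way than meta to hand the partially filled item to the detail callback. A sketch of just that change, applied to the spider above:
            # pass the item as a keyword argument instead of via meta
            yield scrapy.Request(link,
                                 callback=self.parse_detail,
                                 cb_kwargs={'item': item})

    def parse_detail(self, response, item):
        # add other details to the item here
        yield item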

How to loop through multiple URLs to scrape from a CSV file in Scrapy?

My code for scraping data from the Alibaba website:
import scrapy

class IndiamartSpider(scrapy.Spider):
    name = 'alibot'
    allowed_domains = ['alibaba.com']
    start_urls = ['https://www.alibaba.com/showroom/acrylic-wine-box_4.html']

    def parse(self, response):
        Title = response.xpath('//*[@class="title three-line"]/a/@title').extract()
        Price = response.xpath('//div[@class="price"]/b/text()').extract()
        Min_order = response.xpath('//div[@class="min-order"]/b/text()').extract()
        Response_rate = response.xpath('//i[@class="ui2-icon ui2-icon-skip"]/text()').extract()
        for item in zip(Title, Price, Min_order, Response_rate):
            scraped_info = {
                'Title': item[0],
                'Price': item[1],
                'Min_order': item[2],
                'Response_rate': item[3]
            }
            yield scraped_info
Notice the start URL: it only scrapes the given URL, but I want this code to scrape all the URLs present in my CSV file. My CSV file contains a large number of URLs.
Sample of the data.csv file:
'https://www.alibaba.com/showroom/shock-absorber.html',
'https://www.alibaba.com/showroom/shock-wheel.html',
'https://www.alibaba.com/showroom/shoes-fastener.html',
'https://www.alibaba.com/showroom/shoes-women.html',
'https://www.alibaba.com/showroom/shoes.html',
'https://www.alibaba.com/showroom/shoulder-long-strip-bag.html',
'https://www.alibaba.com/showroom/shower-hair-band.html',
...........
How do I import all the links from the CSV file into the code at once?
To correctly loop through a file without loading all of it into memory, you should use generators, since both file objects and the start_requests method in Python/Scrapy are consumed lazily:
from scrapy import Spider, Request

class MySpider(Spider):
    name = 'csv'

    def start_requests(self):
        with open('file.csv') as f:
            for line in f:
                if not line.strip():
                    continue
                yield Request(line.strip())
To explain further:
The Scrapy engine uses start_requests to generate requests as it goes. It will keep generating requests until the concurrent request limit is full (see settings like CONCURRENT_REQUESTS).
It is also worth noting that, by default, Scrapy crawls depth-first: newer requests take priority, so the start_requests loop will be the last to finish.
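If that crawl order matters for your use case, the Scrapy documentation describes settings that switch it to breadth-first; a sketch of the relevant custom_settings would be:
class MySpider(Spider):
    name = 'csv'
    # breadth-first crawl order instead of the default depth-first (values from the Scrapy docs)
    custom_settings = {
        'DEPTH_PRIORITY': 1,
        'SCHEDULER_DISK_QUEUE': 'scrapy.squeues.PickleFifoDiskQueue',
        'SCHEDULER_MEMORY_QUEUE': 'scrapy.squeues.FifoMemoryQueue',
    }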
You're almost there already. The only change needed is to start_urls, which you want to be "all the URLs in the *.csv file". The following code easily implements that change:
with open('data.csv') as file:
    start_urls = [line.strip() for line in file]
Let us assume you have stored the URL list in the form of a DataFrame and you want to loop over each URL inside it. My approach, which worked for me, is given below.
import scrapy
import pandas as pd

class IndiamartSpider(scrapy.Spider):
    name = 'alibot'
    #allowed_domains = ['alibaba.com']
    #start_urls = ['https://www.alibaba.com/showroom/acrylic-wine-box_4.html']

    def start_requests(self):
        # fileContainingUrls.csv is a csv file which has a column named 'URLS'
        # containing all the urls you want to loop over.
        df = pd.read_csv('fileContainingUrls.csv')
        urlList = df['URLS'].to_list()
        for i in urlList:
            yield scrapy.Request(url=i, callback=self.parse)

    def parse(self, response):
        Title = response.xpath('//*[@class="title three-line"]/a/@title').extract()
        Price = response.xpath('//div[@class="price"]/b/text()').extract()
        Min_order = response.xpath('//div[@class="min-order"]/b/text()').extract()
        Response_rate = response.xpath('//i[@class="ui2-icon ui2-icon-skip"]/text()').extract()
        for item in zip(Title, Price, Min_order, Response_rate):
            scraped_info = {
                'Title': item[0],
                'Price': item[1],
                'Min_order': item[2],
                'Response_rate': item[3]
            }
            yield scraped_info
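One caveat, which is an assumption about the file rather than part of the answer above: the data.csv sample shown earlier has no header row, and each URL is wrapped in single quotes with a trailing comma, so there is no 'URLS' column to select. A hedged sketch for that exact shape:
import pandas as pd

# headerless file: there is no 'URLS' column, so take the first column by position
df = pd.read_csv('data.csv', header=None)
# strip whitespace plus the surrounding quotes and trailing commas seen in the sample
urls = df[0].str.strip().str.strip("',").tolist()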

Crawl website from list of values using scrapy

I have a list of NPIs for which I want to scrape the provider names from npidb.org.
The NPI values are stored in a CSV file.
I am able to do it manually by pasting the URLs into the code. However, I am unable to figure out how to do it when I have a list of NPIs and want the provider name for each one.
Here is my current code:
import scrapy
from scrapy.spider import BaseSpider

class MySpider(BaseSpider):
    name = "npidb"

    def start_requests(self):
        urls = [
            'https://npidb.org/npi-lookup/?npi=1366425381',
            'https://npidb.org/npi-lookup/?npi=1902873227',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        page = response.url.split("/")[-1]
        filename = 'npidb-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
Assuming you have a list of NPIs from a CSV file, you can simply use format to build the website address as follows (I also added the part that reads the list from the CSV file; if you already have it, you can omit that part):
    def start_requests(self):
        # get npis from csv file
        npis = []
        with open('test.csv', 'r') as f:
            for line in f.readlines():
                l = line.strip()
                npis.append(l)
        # generate the list of addresses depending on npi
        start_urls = []
        for npi in npis:
            start_urls.append('https://npidb.org/npi-lookup/?npi={}'.format(npi))
        for url in start_urls:
            yield scrapy.Request(url=url, callback=self.parse)
Well, it depends on the structure of your CSV file, but if it contains the NPIs on separate lines, you could do something like:
    def start_requests(self):
        with open('npis.csv') as f:
            for line in f:
                yield scrapy.Request(
                    url='https://npidb.org/npi-lookup/?npi={}'.format(line.strip()),
                    callback=self.parse
                )
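Neither answer changes the parse method, which currently just saves the raw HTML to disk. If the goal is the provider name itself, a sketch is below; the selector is an assumption about npidb.org's markup (not verified), so inspect the page and adjust it:
    def parse(self, response):
        npi = response.url.split('=')[-1]
        # ASSUMED selector -- npidb.org's real markup may differ; check the page source
        provider_name = response.xpath('//h1/text()').get(default='').strip()
        yield {'npi': npi, 'provider_name': provider_name}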

Scrapy - Importing Excel .csv as start_url

So I'm building a scraper that imports a .csv Excel file with one row of ~2,400 websites (each website in its own column) and uses these as the start_urls. I keep getting an error saying that I am passing in a list and not a string. I think this may be caused by the fact that my list basically just has one really long list in it that represents the row. How can I overcome this and put each website from my .csv as its own separate string within the list?
raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)
exceptions.TypeError: Request url must be str or unicode, got list:
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http import HtmlResponse
from tutorial.items import DanishItem
from scrapy.http import Request
import csv

with open('websites.csv', 'rbU') as csv_file:
    data = csv.reader(csv_file)
    scrapurls = []
    for row in data:
        scrapurls.append(row)

class DanishSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = []
    start_urls = scrapurls

    def parse(self, response):
        for sel in response.xpath('//link[@rel="icon" or @rel="shortcut icon"]'):
            item = DanishItem()
            item['website'] = response
            item['favicon'] = sel.xpath('./@href').extract()
            yield item
Thanks!
Joey
Just generating a list for start_urls does not work, as is clearly written in the Scrapy documentation.
From the documentation:
You start by generating the initial Requests to crawl the first URLs, and specify a callback function to be called with the response downloaded from those requests.
The first requests to perform are obtained by calling the start_requests() method which (by default) generates Request for the URLs specified in the start_urls and the parse method as callback function for the Requests.
I would rather do it in this way:
def get_urls_from_csv():
    with open('websites.csv', 'rbU') as csv_file:
        data = csv.reader(csv_file)
        scrapurls = []
        for row in data:
            scrapurls.append(row)
        return scrapurls

class DanishSpider(scrapy.Spider):
    ...

    def start_requests(self):
        return [scrapy.http.Request(url=start_url) for start_url in get_urls_from_csv()]
I find the following useful when in need:
import csv
import scrapy

class DanishSpider(scrapy.Spider):
    name = "rei"

    with open("output.csv", "r") as f:
        reader = csv.DictReader(f)
        start_urls = [item['Link'] for item in reader]

    def parse(self, response):
        yield {"link": response.url}
Try opening the .csv file inside the class (not outside, as you did before) and appending to start_urls. This solution worked for me. Hope this helps :-)
class DanishSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = []
    start_urls = []

    f = open('websites.csv', 'r')
    for i in f:
        u = i.split('\n')
        start_urls.append(u[0])
for row in data:
    scrapurls.append(row)
Here row is a list: [column1, column2, ...].
So I think you need to extract the columns and append them to your start_urls.
for row in data:
    # if every column is a URL string
    for column in row:
        scrapurls.append(column)
Try this way also:
filee = open("filename.csv", "r+")
# Removing the '\n' newline character from each url
r = [i for i in filee]
start_urls = [r[j].replace('\n', '') for j in range(len(r))]
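Putting the thread's advice together for the original one-row-many-columns file, here is a hedged sketch that reads every column of every row inside start_requests and skips empty cells, so each website reaches Scrapy as its own string (the filename and the absence of a header row are assumptions):
import csv
import scrapy

class DanishSpider(scrapy.Spider):
    name = "dmoz"

    def start_requests(self):
        with open('websites.csv', newline='') as csv_file:
            for row in csv.reader(csv_file):
                for url in row:  # one website per column
                    url = url.strip()
                    if url:  # skip empty cells
                        yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        yield {"website": response.url}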
