I've written a script that runs a Scrapy spider located in a different directory. The script takes user input, parses it, and adds it to a URL to be scraped. The script seemed to be working earlier, but now I'm getting the following error:
URLError: <urlopen error [Errno 101] Network is unreachable>
ERROR: Unable to read instance data, giving up
The code for the spider works properly when run with the scrapy crawl command, but isn't working when run from a script for some reason.
Here is the code for the function that runs the spider from the script (located within the spider file):
def spiderCrawl(bandname):
    aSpider = MySpider3()
    aSpider.create_link(bandname)
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner()
    d = runner.crawl(aSpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Function that creates the URL:
def create_link(self, bandname):
    tc_url = "https://www.ticketcity.com/concerts/" + bandname + "-tickets.html"
    start_urls = [tc_url]
Also, below is an image of the terminal with the error message. The fact that a random bandname was entered suggests that the url wasn't even read in the first place. What could be the problem, here? Any help would be appreciated, thanks.
Update:
So it seems that the problem was that my create_link method inside the spider class wasn't properly adding the link to the start_urls list, but the script does run the spider when I use the raw_input statement inside the spider file instead of in the script. What would be the proper way to pass the user's input to the spider file so it can be added as a link? I have included the code for the spider and the script that runs the spider below to make the post more complete:
Script code:
from ticket_city_scraper.ticket_city_scraper import *
from ticket_city_scraper.ticket_city_scraper.spiders import tc_spider
bandname = raw_input("Enter bandname\n") # I took out this line and added it to the spider file to make the script work
tc_spider.spiderCrawl(bandname)
Spider file:
class MySpider3(CrawlSpider):
    handle_httpstatus_list = [416]
    name = 'comparator'
    allowed_domains = ["www.ticketcity.com"]
    start_urls = [tc_url]
    tickets_list_xpath = './/div[@class = "vevent"]'

    def create_link(self, bandname):
        tc_url = "https://www.ticketcity.com/concerts/" + bandname + "-tickets.html"
        self.start_urls = [tc_url]
        #return tc_url

    tickets_list_xpath = './/div[@class = "vevent"]'

    def parse_json(self, response):
        loader = response.meta['loader']
        jsonresponse = json.loads(response.body_as_unicode())
        ticket_info = jsonresponse.get('B')
        price_list = [i.get('P') for i in ticket_info]
        if len(price_list) > 0:
            str_Price = str(price_list[0])
            ticketPrice = unicode(str_Price, "utf-8")
            loader.add_value('ticketPrice', ticketPrice)
        else:
            ticketPrice = unicode("sold out", "utf-8")
            loader.add_value('ticketPrice', ticketPrice)
        return loader.load_item()

    def parse_price(self, response):
        print "parse price function entered \n"
        loader = response.meta['loader']
        event_City = response.xpath('.//span[@itemprop="addressLocality"]/text()').extract()
        eventCity = ''.join(event_City)
        loader.add_value('eventCity', eventCity)
        event_State = response.xpath('.//span[@itemprop="addressRegion"]/text()').extract()
        eventState = ''.join(event_State)
        loader.add_value('eventState', eventState)
        event_Date = response.xpath('.//span[@class="event_datetime"]/text()').extract()
        eventDate = ''.join(event_Date)
        loader.add_value('eventDate', eventDate)
        ticketsLink = loader.get_output_value("ticketsLink")
        json_id_list = re.findall(r"(\d+)[^-]*$", ticketsLink)
        json_id = "".join(json_id_list)
        json_url = "https://www.ticketcity.com/Catalog/public/v1/events/" + json_id + "/ticketblocks?P=0,99999999&q=0&per_page=250&page=1&sort=p.asc&f.t=s&_=1436642392938"
        yield scrapy.Request(json_url, meta={'loader': loader}, callback=self.parse_json, dont_filter=True)

    def parse(self, response):
        """
        # """
        selector = HtmlXPathSelector(response)
        # iterate over tickets
        for ticket in selector.select(self.tickets_list_xpath):
            loader = XPathItemLoader(ComparatorItem(), selector=ticket)
            # define loader
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()
            # iterate over fields and add xpaths to the loader
            loader.add_xpath('eventName', './/span[@class="summary listingEventName"]/text()')
            loader.add_xpath('eventLocation', './/div[@class="divVenue location"]/text()')
            loader.add_xpath('ticketsLink', './/a[@class="divEventDetails url"]/@href')
            #loader.add_xpath('eventDateTime', '//div[@id="divEventDate"]/@title')  # datetime type
            #loader.add_xpath('eventTime', './/*[@class = "productionsTime"]/text()')
            print "Here is ticket link \n" + loader.get_output_value("ticketsLink")
            #sel.xpath("//span[@id='PractitionerDetails1_Label4']/text()").extract()
            ticketsURL = "https://www.ticketcity.com/" + loader.get_output_value("ticketsLink")
            ticketsURL = urljoin(response.url, ticketsURL)
            yield scrapy.Request(ticketsURL, meta={'loader': loader}, callback=self.parse_price, dont_filter=True)

def spiderCrawl(bandname):
    # process = CrawlerProcess({
    #     'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    # })
    # process.crawl(aSpider)
    # process.start()
    aSpider = MySpider3()
    #aSpider.create_link(bandname)
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner()
    d = runner.crawl(aSpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
I can only guess since you didn't provide an MCVE. However, I'd say that in your function create_link, this line:
start_urls = [tc_url]
should really be:
self.start_urls = [tc_url]
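For passing the user's input to the spider, a common pattern is to let Scrapy construct the spider itself and forward the bandname as a keyword argument instead of instantiating MySpider3 by hand. A rough sketch under that assumption (the import path just mirrors the one in your script, so adjust it to your layout):

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from ticket_city_scraper.ticket_city_scraper.spiders.tc_spider import MySpider3

def spiderCrawl(bandname):
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner()
    # pass the spider class, not an instance; extra kwargs reach the spider's __init__
    d = runner.crawl(MySpider3, bandname=bandname)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()

bandname = raw_input("Enter bandname\n")
spiderCrawl(bandname)

On the spider side, build start_urls from that argument in __init__ rather than in create_link:

    def __init__(self, bandname=None, *args, **kwargs):
        super(MySpider3, self).__init__(*args, **kwargs)
        if bandname:
            self.start_urls = ["https://www.ticketcity.com/concerts/" + bandname + "-tickets.html"]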
Related
So I'm doing a project scraping different websites using multiple spiders. I want to make it so that the spiders run again when the user says "Yes" when asked to continue.
keyword = input("enter keyword: ")
page_range = input("enter page range: ")

flag = True
while flag:
    process = CrawlerProcess()
    process.crawl(crawler1, keyword, page_range)
    process.crawl(crawler2, keyword, page_range)
    process.crawl(crawler3, keyword, page_range)
    process.start()

    isContinue = input("Do you want to continue? (y/n): ")
    if isContinue == 'n':
        flag = False
But I get an error saying reactor is not restartable.
Traceback (most recent call last):
  File "/Users/user/Desktop/programs/eshopSpider/eshopSpider.py", line 47, in <module>
    process.start()
  File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/scrapy/crawler.py", line 327, in start
    reactor.run(installSignalHandlers=False)  # blocking call
  File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/twisted/internet/base.py", line 1317, in run
    self.startRunning(installSignalHandlers=installSignalHandlers)
  File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/twisted/internet/base.py", line 1299, in startRunning
    ReactorBase.startRunning(cast(ReactorBase, self))
  File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/twisted/internet/base.py", line 843, in startRunning
    raise error.ReactorNotRestartable()
twisted.internet.error.ReactorNotRestartable
So I guess using a while loop is a no-go. I don't know where to even start...
Method 1:
Scrapy creates a Reactor which can't be reused after it is stopped, but if you run the crawler in a separate process, then the new process will create a new Reactor.
import multiprocessing

def run_crawler(keyword, page_range):
    process = CrawlerProcess()
    process.crawl(crawler1, keyword, page_range)
    process.crawl(crawler2, keyword, page_range)
    process.crawl(crawler3, keyword, page_range)
    process.start()

# --- main ---

keyword = input("enter keyword: ")
page_range = input("enter page range: ")

flag = True
while flag:
    p = multiprocessing.Process(target=run_crawler, args=(keyword, page_range))
    p.start()
    p.join()

    isContinue = input("Do you want to continue? (y/n): ")
    if isContinue == 'n':
        flag = False
It will not work if you use threading instead of multiprocessing, because threads share state, so a new thread would use the same Reactor as the previous thread.
Minimal working code (tested on Linux).
import scrapy

class MySpider(scrapy.Spider):

    name = 'myspider'

    #start_urls = ['https://books.toscrape.com/']

    def __init__(self, keyword, page, *args, **kwargs):
        '''generate start_urls list'''
        super().__init__(*args, **kwargs)

        self.keyword = keyword
        self.page = int(page)
        self.start_urls = [f'https://books.toscrape.com/catalogue/page-{page}.html']

    def parse(self, response):
        print('[parse] url:', response.url)

        for book in response.css('article.product_pod'):
            title = book.css('h3 a::text').get()
            url = book.css('img::attr(src)').get()
            url = response.urljoin(url)
            yield {'page': self.page, 'keyword': self.keyword, 'title': title, 'image': url}
# --- run without project and save in `output.csv` ---

import multiprocessing
from scrapy.crawler import CrawlerProcess

def run_crawler(keyword, page):
    #from scrapy.crawler import CrawlerProcess

    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        # save in file CSV, JSON or XML
        'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
    })
    c.crawl(MySpider, keyword, page)
    c.crawl(MySpider, keyword, int(page)+1)
    c.crawl(MySpider, keyword, int(page)+2)
    c.start()

# --- main ---

if __name__ == '__main__':
    keyword = input("enter keyword: ")
    page = input("enter page: ")

    running = True
    while running:
        p = multiprocessing.Process(target=run_crawler, args=(keyword, page))
        p.start()
        p.join()

        answer = input('Repeat [Y/n]? ').strip().lower()
        if answer == 'n':
            running = False
Method 2:
Found in Google: Restarting a Twisted Reactor.
It is an old post which uses del to remove the twisted.internet.reactor module from memory and later imports it again.
keyword = input("enter keyword: ")
page_range = input("enter page range: ")

flag = True
while flag:
    process = CrawlerProcess()
    process.crawl(crawler1, keyword, page_range)
    process.crawl(crawler2, keyword, page_range)
    process.crawl(crawler3, keyword, page_range)
    process.start()

    isContinue = input("Do you want to continue? (y/n): ")
    if isContinue == 'n':
        flag = False

    import sys
    del sys.modules['twisted.internet.reactor']
    from twisted.internet import reactor
    from twisted.internet import default
    default.install()
Minimal working code (tested on Linux)
import scrapy

class MySpider(scrapy.Spider):

    name = 'myspider'

    #start_urls = ['https://books.toscrape.com/']

    def __init__(self, keyword, page, *args, **kwargs):
        '''generate start_urls list'''
        super().__init__(*args, **kwargs)

        self.keyword = keyword
        self.page = int(page)
        self.start_urls = [f'https://books.toscrape.com/catalogue/page-{page}.html']

    def parse(self, response):
        print('[parse] url:', response.url)

        for book in response.css('article.product_pod'):
            title = book.css('h3 a::text').get()
            url = book.css('img::attr(src)').get()
            url = response.urljoin(url)
            yield {'page': self.page, 'keyword': self.keyword, 'title': title, 'image': url}
# --- run without project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

def run_crawler(keyword, page):
    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        # save in file CSV, JSON or XML
        'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
    })
    c.crawl(MySpider, keyword, page)
    c.crawl(MySpider, keyword, int(page)+1)
    c.crawl(MySpider, keyword, int(page)+2)
    c.start()

# --- main ---

if __name__ == '__main__':
    keyword = input("enter keyword: ")
    page = input("enter page: ")

    running = True
    while running:
        run_crawler(keyword, page)

        answer = input('Repeat [Y/n]? ').strip().lower()
        if answer == 'n':
            running = False

        import sys
        del sys.modules['twisted.internet.reactor']
        from twisted.internet import reactor
        from twisted.internet import default
        default.install()
Method 3:
It seems you could use CrawlerRunner instead of CrawlerProcess, but I didn't test it yet.
Based on the last example in the docs for Running multiple spiders in the same process, I created code which runs the while loop inside the reactor (so it doesn't have to stop it). It first starts one Spider, then runs the second Spider, then asks for continuation and runs the first Spider again, then the second Spider. It doesn't run both Spiders at the same time, but maybe that could be changed somehow.
import scrapy

class MySpider(scrapy.Spider):

    name = 'myspider'

    #start_urls = ['https://books.toscrape.com/']

    def __init__(self, keyword, page, *args, **kwargs):
        '''generate start_urls list'''
        super().__init__(*args, **kwargs)

        self.keyword = keyword
        self.page = int(page)
        self.start_urls = [f'https://books.toscrape.com/catalogue/page-{page}.html']

    def parse(self, response):
        print('[parse] url:', response.url)

        for book in response.css('article.product_pod'):
            title = book.css('h3 a::text').get()
            url = book.css('img::attr(src)').get()
            url = response.urljoin(url)
            yield {'page': self.page, 'keyword': self.keyword, 'title': title, 'image': url}
# --- run without project and save in `output.csv` ---

from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

@defer.inlineCallbacks
def run_crawler():

    running = True
    while running:
        yield runner.crawl(MySpider, keyword, page)
        yield runner.crawl(MySpider, keyword, int(page)+1)
        yield runner.crawl(MySpider, keyword, int(page)+2)

        answer = input('Repeat [Y/n]? ').strip().lower()
        if answer == 'n':
            running = False
            reactor.stop()
            #return

# --- main ---

if __name__ == '__main__':
    keyword = input("enter keyword: ")
    page = input("enter page: ")

    configure_logging()

    runner = CrawlerRunner({
        'USER_AGENT': 'Mozilla/5.0',
        # save in file CSV, JSON or XML
        'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
    })

    run_crawler()

    reactor.run()
EDIT:
The same, but now all crawlers run at the same time:
@defer.inlineCallbacks
def run_crawler():

    running = True
    while running:
        runner.crawl(MySpider, keyword, page)
        runner.crawl(MySpider, keyword, int(page)+1)
        runner.crawl(MySpider, keyword, int(page)+2)
        d = runner.join()
        yield d

        answer = input('Repeat [Y/n]? ').strip().lower()
        if answer == 'n':
            running = False
            reactor.stop()
            #return
You can remove the while loop and use callbacks instead.
Edit: Example added:
def callback_f():
    # stuff #
    calling_f()

def calling_f():
    answer = input("Continue? (y/n)")
    if not answer == 'n':
        callback_f()

callback_f()
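Applied to the Scrapy case, a rough sketch of the same callback idea using CrawlerRunner and the deferred returned by crawl (MySpider is a placeholder for your own spider class; untested):

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

configure_logging()
runner = CrawlerRunner()

def crawl_once():
    d = runner.crawl(MySpider)      # placeholder spider class
    d.addCallback(ask_to_continue)  # fires after this crawl finishes

def ask_to_continue(_):
    answer = input("Continue? (y/n) ")
    if answer == 'n':
        reactor.stop()
    else:
        crawl_once()                # schedule another crawl on the same reactor

crawl_once()
reactor.run()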
import sys

from twisted.internet import reactor  # only this is supposed to be here, we will be deleting the reactor after each run, using the main
from twisted.internet import default
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

configure_logging()
settings = get_project_settings()
runner = CrawlerRunner(settings)
d = runner.crawl('your spider class name')
d.addBoth(lambda _: reactor.stop())
reactor.run()  # the script will block here until all crawling jobs are finished
del sys.modules['twisted.internet.reactor']  # deleting the reactor, because we want to run a for loop, the reactor will be imported again at the top
default.install()
I am trying to launch Scrapy from a .py file with this command:
py myproject.py -f C:\Users\admin\Downloads\test.csv
Here is my file named "myproject.py":
import spiders.ggspider as MySpiders
# Return array
dataFile = args.file
myData = CSVReader.getAnimalList(dataFile)
leSpider = MySpiders.GGCSpider()
leSpider.myList = myData
leSpider.start_requests()
Here is my spider file:
import scrapy
import urllib
class GGSpider(scrapy.Spider):
    name = "spiderman"
    domain = "https://www.google.fr/?q={}"
    myList = []

    def __init__(self):
        pass

    def start_requests(self):
        for leObject in self.myList:
            tmpURL = self.domain.format(urllib.parse.urlencode({'text': leObject[0]}))
            yield scrapy.Request(url=self.domain + leObject[0], callback=self.parse)

    def parse(self, response):
        print('hello')
        print(response)
My problem is: I do get into start_requests (I put a print before the yield and saw it in the console),
but the callback never seems to happen (I don't get the 'hello' print).
I really don't know why (I'm new to Python, so maybe I'm missing something obvious).
I guess that's because a generator doesn't actually run until you retrieve its values. You could try to consume the generator somehow:
import spiders.ggspider as MySpiders
# Return array
dataFile = args.file
myData = CSVReader.getAnimalList(dataFile)
leSpider = MySpiders.GGCSpider()
leSpider.myList = myData
for request in leSpider.start_requests():
    do_something(request)
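A quick plain-Python illustration of the laziness referred to above, independent of Scrapy:

def gen():
    print("side effect")  # only runs once the generator is actually iterated
    yield 1

g = gen()        # nothing is printed yet
items = list(g)  # now "side effect" is printed and items == [1]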
UPD: Here is a better example of running a Spider from a script:
import scrapy
from scrapy.crawler import CrawlerProcess

class MySpider(scrapy.Spider):
    # Your spider definition
    ...

process = CrawlerProcess(settings={
    "FEEDS": {
        "items.json": {"format": "json"},
    },
})

process.crawl(MySpider)
process.start()  # the script will block here until the crawling is finished
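To get the CSV data into the spider with this approach, keyword arguments passed to process.crawl are forwarded to the spider's constructor, so the list can be accepted there; a sketch reusing the names from the question (untested):

class GGSpider(scrapy.Spider):
    name = "spiderman"
    domain = "https://www.google.fr/?q={}"

    def __init__(self, myList=None, *args, **kwargs):
        super(GGSpider, self).__init__(*args, **kwargs)
        self.myList = myList or []

    # start_requests and parse as in the question
    ...

process = CrawlerProcess()
process.crawl(GGSpider, myList=myData)  # myData from CSVReader.getAnimalList(dataFile)
process.start()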
In the following program, I tried to call self.start_requests() in self.after_login(), but it did not succeed. Rewriting the content of self.start_requests() inline instead does work.
My question is: why can't I just directly call self.start_requests()?
__author__ = 'parallels'

import scrapy
from scrapy import Request
from bs4 import BeautifulSoup

def start_requests(usrname, password):
    return Request(url="http://www.heibanke.com/lesson/crawler_ex01/",
                   cookies={'name': usrname, 'password': password}, dont_filter=True)

class heibanke2(scrapy.Spider):
    name = "herbanke2"
    # start_urls = ["http://www.heibanke.com/lesson/crawler_ex01/"]
    password = 4

    def start_requests(self):
        return [Request("http://www.heibanke.com/lesson/crawler_ex01/", callback=self.post_login, dont_filter=True)]

    #FormRequeset
    def post_login(self, response):
        print 'Preparing login'
        print "current password:", str(self.password)
        return [scrapy.FormRequest.from_response(response,
                    formdata={
                        'username': "JoseLyn",
                        'password': str(self.password)
                    },
                    callback=self.after_login
                )]

    def after_login(self, response):
        print "after_login"
        with open("body" + str(self.password), "wb") as f:
            f.write(response.body)
        soup = BeautifulSoup(response.body, "lxml")
        if "JoseLyn" not in soup.h3.string:
            self.password += 1
            # self.start_requests()
            return [Request("http://www.heibanke.com/lesson/crawler_ex01/", callback=self.post_login, dont_filter=True)]
        else:
            print "password found:", str(self.password)
            print "next mission at:", 'http://www.heibanke.com' + soup.a['href']
Thank you in advance!
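For what it's worth, a minimal sketch of the difference, assuming Scrapy's usual contract that only requests returned or yielded from a callback get scheduled; calling the method directly just builds Request objects and throws the list away:

def after_login(self, response):
    self.start_requests()         # no effect: the returned list is discarded, the engine never sees it
    return self.start_requests()  # scheduled: the callback's return value is handed back to the engine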
Good afternoon,
I am trying to fetch data from the web and store it in SQL Server. With the lib pymssql, the connection has been established, but when an item is processed the error "too many values to unpack" comes up, so I have also attached the MyItem class. I can't see an obvious mistake.
Here is the code in pipelines.py:
# -*- coding: utf-8 -*-

import pymssql
from scrapy import signals
import json
import codecs

class MyPipeline(object):
    def __init__(self):
        self.conn = pymssql.connect(host=r".\\MyPC", user='sa', password='XXXX', database='Webmining')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        try:
            self.cursor.executemany(
                "INSERT INTO RecruitInformation(recruitNumber,name,detailLink,publishTime,catalog,worklocation) VALUES (%d,%s,%s,%t,%s,%s)",
                (item['recruitNumber'], item['name'], item['detailLink'], item['publishTime'], item['catalog'], item['worklocation']))
            self.conn.commit()
        except pymssql.InterfaceError, e:
            print ("pymssql.InterfaceError")
        except pymssql.DataError, e:
            print ("pymssql.DataError")
        except pymssql.OperationalError, e:
            print ("pymssql.OperationalError")
        except pymssql.IntegrityError, e:
            print ("pymssql.IntegrityError")
        except pymssql.InternalError, e:
            print ("pymssql.InternalError")
        except pymssql.ProgrammingError, e:
            print ("pymssql.ProgrammingError")
        except pymssql.NotSupportedError, e:
            print ("pymssql.NotSupportedError")
        return item

    def spider_closed(self, spider):
        self.conn.close()
The code in item.py is as follows:
import scrapy
from scrapy.item import Item, Field

class MyItem(Item):
    name = Field()
    catalog = Field()
    workLocation = Field()
    recruitNumber = Field()
    detailLink = Field()
    publishTime = Field()

class MySpider(CrawlSpider):
    name = "xxxx"
    allowed_domains = ["xxxx.com"]
    start_urls = ["http://xx.xxxx.com/position.php"]
    rules = [Rule(sle(allow=("/position.php\?&start=\d{,4}#a")), follow=True, callback='parse_item')]

    def parse_item(self, response):
        items = []
        sel = Selector(response)
        base_url = get_base_url(response)
        sites_even = sel.css('table.tablelist tr.even')
        for site in sites_even:
            item = MyItem()
            item['name'] = site.css('.l.square a').xpath('text()').extract()
            relative_url = site.css('.l.square a').xpath('@href').extract()[0]
            item['detailLink'] = urljoin_rfc(base_url, relative_url)
            item['catalog'] = site.css('tr > td:nth-child(2)::text').extract()
            item['workLocation'] = site.css('tr > td:nth-child(4)::text').extract()
            item['recruitNumber'] = site.css('tr > td:nth-child(3)::text').extract()
            item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract()
            items.append(item)
        sites_odd = sel.css('table.tablelist tr.odd')
        for site in sites_odd:
            item = MyItem()
            item['name'] = site.css('.l.square a').xpath('text()').extract()
            relative_url = site.css('.l.square a').xpath('@href').extract()[0]
            item['detailLink'] = urljoin_rfc(base_url, relative_url)
            item['catalog'] = site.css('tr > td:nth-child(2)::text').extract()
            item['workLocation'] = site.css('tr > td:nth-child(4)::text').extract()
            item['recruitNumber'] = site.css('tr > td:nth-child(3)::text').extract()
            item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract()
            items.append(item)
        return items

    def _process_request(self, request):
        info('process ' + str(request))
        return request
Try using self.cursor.execute instead of self.cursor.executemany in your code.
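That matches the "too many values to unpack" error: executemany expects a sequence of parameter tuples (one per row), while execute takes a single tuple. A sketch of the adjusted process_item, also assuming the %t placeholder in the question was meant to be %s and that the key should match the workLocation field of the Item:

def process_item(self, item, spider):
    self.cursor.execute(
        "INSERT INTO RecruitInformation"
        "(recruitNumber, name, detailLink, publishTime, catalog, worklocation) "
        "VALUES (%d, %s, %s, %s, %s, %s)",
        (item['recruitNumber'], item['name'], item['detailLink'],
         item['publishTime'], item['catalog'], item['workLocation']))
    self.conn.commit()
    return item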
I use this script to get a list of all file updates to a certain directory. I then parse that list to get a list of time slots I have been active in that directory. That way I can quickly see how much time I have spent on the project and know what to charge my client.
I have written a small python script, adapted from this: https://github.com/jncraton/PythonDropboxUploader
I added the bottom function to retrieve a specific events page from https://www.dropbox.com/events?ns=false&n=50
I used the script 2 months ago and it worked well, but now I am getting 403: Forbidden errors on:
eventSrc = self.browser.open(req).read()
Dropbox probably tries to block scrapers like mine to push programmers to use their API instead, but unfortunately the API doesn't support listing the events.
Can anybody help me out to get it working again?
This is the python code to create the connection:
import mechanize
import urllib
import urllib2  # needed for urllib2.Request in get_dir_list
import re
import json

class DropboxConnection:
    """ Creates a connection to Dropbox """

    email = ""
    password = ""
    root_ns = ""
    token = ""
    browser = None

    def __init__(self, email, password):
        self.email = email
        self.password = password

        self.login()
        self.get_constants()

    def login(self):
        """ Login to Dropbox and return mechanize browser instance """

        # Fire up a browser using mechanize
        self.browser = mechanize.Browser()
        self.browser.set_handle_equiv(False)
        self.browser.set_handle_redirect(True)
        self.browser.set_handle_referer(True)
        self.browser.set_handle_robots(False)
        self.browser.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:14.0) Gecko/20120722 Firefox/14.0.1')]

        # Browse to the login page
        self.browser.open('https://www.dropbox.com/login')

        # Enter the username and password into the login form
        isLoginForm = lambda l: l.action == "https://www.dropbox.com/login" and l.method == "POST"

        try:
            self.browser.select_form(predicate=isLoginForm)
        except:
            self.browser = None
            raise(Exception('Unable to find login form'))

        self.browser['login_email'] = self.email
        self.browser['login_password'] = self.password
        self.browser['t'] = "1230"

        # Send the form
        response = self.browser.submit()

    def get_constants(self):
        """ Load constants from page """

        home_src = self.browser.open('https://www.dropbox.com/home').read()

        try:
            self.root_ns = re.findall(r"root_ns: (\d+)", home_src)[0]
            self.token = re.findall(r"TOKEN: '(.+)'", home_src)[0]
        except:
            raise(Exception("Unable to find constants for AJAX requests"))

    def upload_file(self, local_file, remote_dir, remote_file):
        """ Upload a local file to Dropbox """

        if(not self.is_logged_in()):
            raise(Exception("Can't upload when not logged in"))

        self.browser.open('https://www.dropbox.com/')

        # Add our file upload to the upload form
        isUploadForm = lambda u: u.action == "https://dl-web.dropbox.com/upload" and u.method == "POST"

        try:
            self.browser.select_form(predicate=isUploadForm)
        except:
            raise(Exception('Unable to find upload form'))

        self.browser.form.find_control("dest").readonly = False
        self.browser.form.set_value(remote_dir, "dest")
        self.browser.form.add_file(open(local_file, "rb"), "", remote_file)

        # Submit the form with the file
        self.browser.submit()

    def get_dir_list(self, remote_dir):
        """ Get file info for a directory """

        if(not self.is_logged_in()):
            raise(Exception("Can't download when not logged in"))

        req_vars = "ns_id=" + self.root_ns + "&referrer=&t=" + self.token

        req = urllib2.Request('https://www.dropbox.com/browse' + remote_dir, data=req_vars)
        req.add_header('Referer', 'https://www.dropbox.com/home' + remote_dir)

        dir_info = json.loads(self.browser.open(req).read())

        dir_list = {}

        for item in dir_info['file_info']:
            # Eliminate directories
            if(item[0] == False):
                # get local filename
                absolute_filename = item[3]
                local_filename = re.findall(r".*\/(.*)", absolute_filename)[0]

                # get file URL and add it to the dictionary
                file_url = item[8]
                dir_list[local_filename] = file_url

        return dir_list

    def get_download_url(self, remote_dir, remote_file):
        """ Get the URL to download a file """

        return self.get_dir_list(remote_dir)[remote_file]

    def download_file(self, remote_dir, remote_file, local_file):
        """ Download a file and save it locally """

        fh = open(local_file, "wb")
        fh.write(self.browser.open(self.get_download_url(remote_dir, remote_file)).read())
        fh.close()

    def is_logged_in(self):
        """ Checks if a login has been established """
        if(self.browser):
            return True
        else:
            return False

    def getEventsPage(self, n):
        if(not self.is_logged_in()):
            raise(Exception("Can't get event page when not logged in"))

        url = 'https://www.dropbox.com/next_events'
        values = {'cur_page': n, 'ns_id': 'false'}
        data = urllib.urlencode(values)

        req = mechanize.Request(url, data)
        # print url + '?' + data
        eventSrc = self.browser.open(req).read()

        return eventSrc
And this is the loop that parses the events pages:
from dbupload import DropboxConnection
from getpass import getpass
from bs4 import BeautifulSoup
import re
import parsedatetime.parsedatetime as pdt
import parsedatetime.parsedatetime_consts as pdc

c = pdc.Constants()
p = pdt.Calendar(c)

email = "myemail@gmail.com"  # raw_input("Enter Dropbox email address:")
password = getpass("Enter Dropbox password:")

dateFile = open('all_file_updates.txt', "wb")

try:
    # Create the connection
    conn = DropboxConnection(email, password)
except:
    print("Connection failed")
else:
    print("Connection successful")

    n = 250
    found = 0
    while(n >= 0):
        eventsPageSrc = conn.getEventsPage(n)
        soup = BeautifulSoup(eventsPageSrc)
        table = soup.find("table", {"id": "events"})

        for row in table.findAll('tr'):
            link = row.find("a", href=re.compile('^https://dl-web.dropbox.com/get/ProjectName'))

            if(link != None):
                dateString = row.find("td", attrs={'class': 'modified'}).string
                date = p.parse(dateString)

                dateFile.write('Date: ' + str(date) + ' file: ' + link.string + '\n')
                found = found + 1

        n = n - 1
        print 'page: ' + str(n) + ' Total found: ' + str(found)
In def get_constants(self), change:
self.token = re.findall(r"TOKEN: '(.+)'", home_src)[0]
to
self.token = re.findall(r'TOKEN: "(.+)"', home_src)[0]
Dropbox has changed the way it stores constants.
Hope it helps.
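Since this kind of scraping breaks whenever the page markup changes, a defensive variant of get_constants that tries both quoting styles might look like this (a sketch, untested against the current site):

def get_constants(self):
    """ Load constants from the page, tolerating either quoting style """
    home_src = self.browser.open('https://www.dropbox.com/home').read()
    try:
        self.root_ns = re.findall(r"root_ns: (\d+)", home_src)[0]
        # newer pages wrap the token in double quotes, older ones in single quotes
        token_matches = (re.findall(r'TOKEN: "(.+)"', home_src)
                         or re.findall(r"TOKEN: '(.+)'", home_src))
        self.token = token_matches[0]
    except IndexError:
        raise(Exception("Unable to find constants for AJAX requests"))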