I have a Discord bot for UKHotDeals, but it throws an error.
It is written for Python 3.x.
The original repository can be found here: https://github.com/davidteather/Hotukdeals-Discord-Notifier
Traceback (most recent call last):
File "C:\Users\USER\Desktop\Hotukdeals-Discord-Notifier-master\main.py", line 179, in <module>
client = MyClient(channel_id)
File "C:\Users\USER\Desktop\Hotukdeals-Discord-Notifier-master\main.py", line 31, in __init__
super().__init__(*args, **kwargs)
TypeError: Client.__init__() missing 1 required keyword-only argument: 'intents'
I can't see where I'm missing something in the code, which is this:
import discord
import asyncio
import requests
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
import json
with open('settings.json') as data:
    settings = json.load(data)
    min_upvotes = int(settings["min_upvotes"])
    max_upvotes = int(settings["max_upvotes"])
    base_url = settings["base_url"]
    pages_to_index = int(settings["pages_to_index"])
    discord_api_key = settings["discord_api_token"]
    min_price = float(settings["min_price"])
    max_price = float(settings["max_price"])
    channel_id = int(settings["discord_channel_id"])
    time_interval_seconds = int(settings["time_interval_seconds"])


class MyClient(discord.Client):
    def __init__(self, channel, *args, **kwargs):
        self.outOfStock = []
        self.checkUrls = []
        self.channelID = channel
        super().__init__(*args, **kwargs)
        # create the background task and run it in the background
        self.bg_task = self.loop.create_task(self.my_background_task())

    # Check deals
    def checkDealsBeautifulSoup(self, url):
        # Imports
        import requests
        from bs4 import BeautifulSoup
        import json
        import random

        # Loads JSON and vars
        with open('settings.json') as data:
            settings = json.load(data)
            min_upvotes = int(settings["min_upvotes"])
            max_upvotes = int(settings["max_upvotes"])
            min_price = float(settings["min_price"])
            max_price = float(settings["max_price"])

        # Loads proxies
        with open('proxies.txt', 'r') as proxies:
            proxies = proxies.readlines()

        # Picks random proxy
        proxy = random.choice(proxies)

        returnMsgs = []
        newArray = []

        # Reads already used things
        with open('data/usedLinks.txt', 'r') as data:
            usedArray = data.readlines()

        # Sets up proxy
        proxies = {
            "http": "http://" + proxy,
            "https": "https://" + proxy,
        }

        page = requests.get(url, proxies=proxy)
        soup = BeautifulSoup(page.text, 'html.parser')

        var = False

        # Tries to get things
        try:
            listings = soup.find_all(
                'article', attrs={'data-handler': 'history'})
            upvotes = soup.find_all('span', attrs={'class': 'cept-vote-temp'})
            pricing = soup.find_all('span', attrs={'class': 'thread-price'})
            urls = soup.find_all(
                'a', attrs={'class': 'cept-thread-image-link'})
            var = True
        except:
            var = False

        if var == True:
            upvotesIndex = 0
            index = 0
            for x in range(0, len(listings)):
                try:
                    upvote = upvotes[upvotesIndex].text.strip().replace(
                        " ", "").replace("°", "").replace("\n", "")
                    if "Deal" in upvote or "alerts" in upvote:
                        upvotesIndex += 1
                        upvote = upvotes[upvotesIndex].text.strip().replace(
                            " ", "").replace("°", "").replace("\n", "")
                except:
                    upvote = 0

                try:
                    price = pricing[index].text.strip().replace("£", "")
                except:
                    price = 0

                try:
                    url = urls[index].get('href')
                except:
                    url = None

                if price != "FREE":
                    try:
                        price = float(price.replace(",", ""))
                    except:
                        price = 0
                else:
                    price = 0

                if min_price <= price <= max_price:
                    if min_upvotes <= int(upvote) <= max_upvotes:
                        if url != None:
                            if url + "\n" not in usedArray:
                                # Return Message
                                message = url + " Satisfies your deal criteria. It is at " + \
                                    str(upvote) + \
                                    " degrees and costs £" + str(price)
                                returnMsgs.append(message)
                                usedArray.append(url)
                                newArray.append(url)

                upvotesIndex += 1
                index += 1

        # Saves new logged files
        with open('data/usedLinks.txt', 'a') as fileObj:
            for line in newArray:
                fileObj.write(line + "\n")

        # Returns stuff
        return returnMsgs

    # On start
    async def on_ready(self):
        print('Logged in as')
        print(self.user.name)
        print(self.user.id)
        print('------')

    # On message
    async def on_message(self, message):
        if message.author.id == self.user.id:
            return

    # Background manager
    async def my_background_task(self):
        await self.wait_until_ready()
        channel = self.get_channel(int(channel_id))
        while not self.is_closed():
            for page in range(0, int(pages_to_index)):
                print('checking page ' + str(page))
                res = self.checkDealsBeautifulSoup(
                    base_url + "?page=" + str(page))
                if res != []:
                    for msg in res:
                        await channel.send(msg)
            await asyncio.sleep(int(time_interval_seconds))


# Main
client = MyClient(channel_id)
client.run(discord_api_key)
channel_id and discord_api_key are correctly set in settings.json, like this:
{
    "min_upvotes": "500",
    "max_upvotes": "1000",
    "base_url": "https://www.hotukdeals.com",
    "pages_to_index": "10",
    "discord_api_token": "asdAxNasdDkxNzQ1NDcasdasd4ODU1OTAxOQ.GxasdNr.Hasdv7k9Iladsdvasd67jasdasdCXHF4",
    "min_price": "0",
    "max_price": "500",
    "discord_channel_id": "5712311231233167",
    "time_interval_seconds": "1800"
}
Looking at this other thread, discord.Client now requires an Intents object in its constructor:
client = discord.Client(intents=discord.Intents.default())
In your case, the error comes from the discord.Client base class, so you have to fix the call where you instantiate your own subclass, here:
class MyClient(discord.Client):
    def __init__(self, channel, *args, **kwargs):
        self.outOfStock = []
        self.checkUrls = []
        self.channelID = channel
        super().__init__(*args, **kwargs)
See:
https://discordpy.readthedocs.io/en/stable/api.html#discord.Client.intents
https://discordpy.readthedocs.io/en/stable/api.html#discord.Intents
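Applied to the code in the question, the fix is to build an Intents object and pass it through when the client is created; MyClient.__init__ already forwards **kwargs to discord.Client, so it reaches the base class. A minimal sketch (whether you actually need the message_content intent depends on what the bot does):

intents = discord.Intents.default()
# message_content is a privileged intent: enable it here and in the
# Discord developer portal only if the bot reads message text.
intents.message_content = True

client = MyClient(channel_id, intents=intents)
client.run(discord_api_key)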
Related
I am pulling video stats from YouTube and would like to automate the pull. At the moment the code gets all the videos from the API into a list, but when I run it again it rebuilds the list of stats from scratch. Can I have it only add the stats for new videos to the list? No rows should be added if no new video was uploaded.
You need to replace the API key and channel ID for the code to work.
file1
from UCODE import Channel_Stats

API_KEY = "<API>"
channel_id = ["UCLuR42wJEtpX5I-FaTasdcvViA", "UCLuR42wJEtpX5IasdFaTasdcvViA"]


def get_stats(channel_id):
    yt = Channel_Stats(API_KEY, channel_id)
    yt.extract_all()
    t = yt.save_and_return()
    return t
file2
import json
import requests
from tqdm import tqdm
import pandas as pd
from youtube_transcript_api import YouTubeTranscriptApi


class Channel_Stats:
    def __init__(self, api_key, channel_id):
        self.api_key = api_key
        self.channel_id = channel_id
        self.channel_statistics = None
        self.video_data = None

    def extract_all(self):
        self.get_channel_statistics()
        self.get_channel_video_data()

    def get_channel_statistics(self):
        """Extract the channel statistics"""
        print('get channel statistics...')
        url = f'https://www.googleapis.com/youtube/v3/channels?part=statistics&id={self.channel_id}&key={self.api_key}'
        pbar = tqdm(total=1)

        json_url = requests.get(url)
        data = json.loads(json_url.text)
        try:
            data = data['items'][0]['statistics']
        except KeyError:
            print('Could not get channel statistics')
            data = {}

        self.channel_statistics = data
        pbar.update()
        pbar.close()
        return data

    def get_channel_video_data(self):
        "Extract all video information of the channel"
        print('get video data...')
        channel_videos, channel_playlists = self._get_channel_content(limit=50)

        parts = ["snippet", "statistics", "contentDetails", "topicDetails"]
        for video_id in tqdm(channel_videos):
            for part in parts:
                data = self._get_single_video_data(video_id, part)
                channel_videos[video_id].update(data)

        self.video_data = channel_videos
        return channel_videos

    def _get_single_video_data(self, video_id, part):
        """
        Extract further information for a single video
        parts can be: 'snippet', 'statistics', 'contentDetails', 'topicDetails'
        """
        url = f"https://www.googleapis.com/youtube/v3/videos?part={part}&id={video_id}&key={self.api_key}"
        json_url = requests.get(url)
        data = json.loads(json_url.text)
        try:
            data = data['items'][0][part]
        except KeyError as e:
            print(f'Error! Could not get {part} part of data: \n{data}')
            data = dict()
        return data

    def _get_channel_content(self, limit=None, check_all_pages=True):
        """
        Extract all videos and playlists, can check all available search pages
        channel_videos = videoId: title, publishedAt
        channel_playlists = playlistId: title, publishedAt
        return channel_videos, channel_playlists
        """
        url = f"https://www.googleapis.com/youtube/v3/search?key={self.api_key}&channelId={self.channel_id}&part=snippet,id&order=date"
        if limit is not None and isinstance(limit, int):
            url += "&maxResults=" + str(limit)

        vid, pl, npt = self._get_channel_content_per_page(url)
        idx = 0
        while (check_all_pages and npt is not None and idx < 10):
            nexturl = url + "&pageToken=" + npt
            next_vid, next_pl, npt = self._get_channel_content_per_page(nexturl)
            vid.update(next_vid)
            pl.update(next_pl)
            idx += 1

        return vid, pl

    def _get_channel_content_per_page(self, url):
        """
        Extract all videos and playlists per page
        return channel_videos, channel_playlists, nextPageToken
        """
        json_url = requests.get(url)
        data = json.loads(json_url.text)
        channel_videos = dict()
        channel_playlists = dict()
        if 'items' not in data:
            print('Error! Could not get correct channel data!\n', data)
            return channel_videos, channel_videos, None

        nextPageToken = data.get("nextPageToken", None)

        item_data = data['items']
        for item in item_data:
            try:
                kind = item['id']['kind']
                published_at = item['snippet']['publishedAt']
                title = item['snippet']['title']
                if kind == 'youtube#video':
                    video_id = item['id']['videoId']
                    channel_videos[video_id] = {'publishedAt': published_at, 'title': title}
                elif kind == 'youtube#playlist':
                    playlist_id = item['id']['playlistId']
                    channel_playlists[playlist_id] = {'publishedAt': published_at, 'title': title}
            except KeyError as e:
                print('Error! Could not extract data from item:\n', item)

        return channel_videos, channel_playlists, nextPageToken

    def save_and_return(self):
        """Dumps channel statistics and video data in a single json file"""
        if self.channel_statistics is None or self.video_data is None:
            print('data is missing!\nCall get_channel_statistics() and get_channel_video_data() first!')
            return

        fused_data = {'channel_videos_stats': self.video_data}
        vidoeId = [i for i in self.video_data]
        data = []
        for i in self.video_data:
            self.video_data[i]['videoId'] = i
            data.append(self.video_data[i])

        data = pd.DataFrame(data)
        data['thumbnails'] = [data['thumbnails'][i]['default']['url'] for i in range(len(data))]

        transcript = []
        for i in range(len(data)):
            try:
                transcript.append(YouTubeTranscriptApi.get_transcript(data['videoId'][i]))
            except:
                transcript.append(None)
        data['transcript'] = transcript
        # data['transcript'] = [YouTubeTranscriptApi.get_transcript(data['videoId'][i]) for i in range(len(data))]

        channel_title = self.video_data.popitem()[1].get('channelTitle', self.channel_id)
        channel_title = channel_title.replace(" ", "_").lower()
        # data.to_excel(channel_title+'.xlsx')
        # print('file dumped to', filename)

        return data
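One way to add only the stats of newly uploaded videos is to keep the videoIds from the previous run and filter them out before appending. Here is a minimal sketch, assuming the DataFrame returned by save_and_return() is persisted to a CSV between runs (the channel_stats.csv filename and the append_new_stats helper are hypothetical, not part of the code above):

import os
import pandas as pd

def append_new_stats(new_data, csv_path="channel_stats.csv"):
    """Append only rows whose videoId has not been seen in a previous run."""
    if os.path.exists(csv_path):
        existing = pd.read_csv(csv_path)
        # keep only videos that are not already stored
        fresh = new_data[~new_data["videoId"].isin(set(existing["videoId"]))]
        combined = pd.concat([existing, fresh], ignore_index=True)
    else:
        combined = new_data
    combined.to_csv(csv_path, index=False)
    return combined

# usage, building on file1:
# stats = get_stats(channel_id[0])
# append_new_stats(stats)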
I'm trying to get coin values from ParaSwap, but the async version does not work while the sync one does.
I've tried removing SSL verification, waiting between requests, and limiting TCP connections, but in the end nothing worked correctly with async/aiohttp.
I'm working on an old Raspberry Pi, if that's useful.
Here's my code:
async def get_price_as():
    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False, limit=8)) as session:
        req_url = "https://apiv5.paraswap.io/prices/"
        param = {}
        param["srcDecimals"] = str(18)
        param["destDecimals"] = str(18)
        param["amount"] = str(10**18)
        for res in os.listdir(wu.chemin_cles):
            W3[res] = Web3(Web3.HTTPProvider(wu.url_provider[res]["rpc"]))
            clef_privee[res] = wu.cle_privee(res)
            param["network"] = wu.chain_ID(res)
            for a in adresses[res]:
                for b in adresses[res]:
                    if b != a:
                        param["srcToken"] = a
                        param["destToken"] = b
                        async with session.get(req_url, params=param) as resp:
                            response = await resp.json(content_type=None)
                            if response.get('priceRoute') == None:
                                print(res, " ", a, "/", b, " : ", response['error'])
                                c_1 += 1
                                r = paraswap.get_price(adresses[res][a], "18", adresses[res][b], "18", wu.chain_ID(res), str(10**18))
                                if r != 1:
                                    c_2 += 1
                                    print(float(r["priceRoute"]["destAmount"])/10**18, "sync ...")
                            else:
                                response = float(response["priceRoute"]["destAmount"])/10**18
                                print(res, " ", a, "/", b, " : ", response)

asyncio.run(get_price_as())
# paraswap
def get_price(srcToken, srcDecimals, destToken, destDecimals, chainID, amount):
    req_url = "https://apiv5.paraswap.io/prices/"
    param = {}
    param["srcToken"] = srcToken
    param["srcDecimals"] = srcDecimals
    param["destToken"] = destToken
    param["destDecimals"] = destDecimals
    param["network"] = chainID
    param["amount"] = amount
    response = requests.get(req_url, params=param)
    if response.status_code == 200:
        return response.json()
    else:
        return 1
It's not pretty yet, but I just want something that works for now. Thanks!
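For reference, a stripped-down sketch of querying the ParaSwap prices endpoint for several pairs concurrently with aiohttp and asyncio.gather. The token addresses are placeholders and the helper names are my own; errors are returned rather than raised so one failed pair does not cancel the whole batch:

import asyncio
import aiohttp

PRICES_URL = "https://apiv5.paraswap.io/prices/"

async def fetch_price(session, src_token, dest_token, network):
    # one request per pair; all query parameters passed as strings
    params = {
        "srcToken": src_token, "destToken": dest_token,
        "srcDecimals": "18", "destDecimals": "18",
        "network": str(network), "amount": str(10**18),
    }
    async with session.get(PRICES_URL, params=params) as resp:
        data = await resp.json(content_type=None)
    route = data.get("priceRoute")
    return float(route["destAmount"]) / 10**18 if route else data.get("error")

async def main():
    pairs = [("0xTokenA", "0xTokenB"), ("0xTokenB", "0xTokenA")]  # placeholder addresses
    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(limit=8)) as session:
        results = await asyncio.gather(
            *(fetch_price(session, a, b, network=1) for a, b in pairs)
        )
    for (a, b), price in zip(pairs, results):
        print(a, "/", b, ":", price)

asyncio.run(main())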
I'm not very experienced at programming in Python, but I've also seen in several posts that this problem might be solved by adding self to the method definition. The problem is that I actually already did this right from the beginning. This is the code I'm using:
### API MANAGER ###
class api_manager():
    import requests, time, pandas as pd
    #from ipython.display import display, HTML

    # heroes = []
    # items = []
    # token = ''
    # url = ""

    def __init__(self, api_identifier = 1):
        test_api_url = "https://api.steampowered.com/IDOTA2Match_570/"
        live_api_url = "https://api.steampowered.com/IDOTA2Match_205790/"
        self.heroes = []
        self.items = []
        self.token = ''
        self.url = ""

        if api_identifier == 1:
            self.url = live_api_url
        else:
            self.url = test_api_url

        self.get_access_token()
        self.initialize_heroes()
        self.initialize_items()
        pass

    def get_access_token(self):
        with open("conf/access.config") as file:
            self.token = file.read().split(":")[1]
            file.close
        pass

    def initialize_heroes(self):
        response = self.requests.get(self.url + "GetHeroes/v1/?format=JSON&language=en_us&key=" + self.token)
        hero_list = response.json()
        for hero_id in range(len(hero_list['result']['heroes'])):
            self.heroes.append([hero_list['result']['heroes'][hero_id]['id'], hero_list['result']['heroes'][hero_id]['localized_name'], hero_list['result']['heroes'][hero_id]['name'].replace('npc_dota_hero_', "").replace("_", " ")])
        self.heroes.sort()
        heroes_df = self.pd.DataFrame(self.heroes, columns=["ID", "Hero", "Hero Tag"])
        self.pd.set_option('display.max_colwidth', -1)
        #display(HTML(heroes_df.to_html(index = False)))
        pass

    def initialize_items(self):
        response = self.requests.get(self.url + "GetGameItems/v1/?format=JSON&language=en_us&key=" + self.token)
        item_list = response.json()
        for item_id in range(len(item_list['result']['items'])):
            self.items.append([item_list['result']['items'][item_id]['id'], item_list['result']['items'][item_id]['localized_name'], response.json()['result']['items'][item_id]['name']])
        self.items.sort()
        items_df = self.pd.DataFrame(self.items, columns=["ID", "Item", "Item Tag"])
        self.pd.set_option('display.max_colwidth', -1)
        #display(HTML(items_df.to_html(index = False)))
        pass

    def get_match_details(match_id, self):
        response = self.requests.get(self.url + "GetMatchDetails/V001/?format=JSON&language=en_us&key=" + self.token + "&match_id=" + str(match_id))
        print(response.json())
        pass

    def get_match_details_in_range(match_id, match_id_upper_bound, self):
        for next_match_id in range(match_id, match_id_upper_bound):
            response = self.requests.get(self.url + "GetMatchDetails/V001/?format=JSON&language=en_us&key=" + self.token + "&match_id=" + str(next_match_id))
            print(response.json())
            self.time.sleep(1.05)
        pass
And this is the error I'm getting in the Python 3 console, run from Windows cmd:
>>> instance = api_manager()
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "E:\Dropbox\DotA 2 WebAPI Development\Executable Python Files\dota_api_manager.py", line 22, in __init__
self.get_access_token()
TypeError: get_access_token() takes 0 positional arguments but 1 was given
I don't know what exactly I'm doing wrong. I already tried different things in my code but never got it to actually work. There was one moment when the code executed properly and I could actually call the method get_match_details(), but I got an error because the url parameter was not set for whatever reason, although it should have been initialized right at the beginning of __init__().
Thanks in advance for your help and let me know if you need any additional information.
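For what it's worth, the error in that traceback is exactly what Python 3 raises when a method is defined without self but called on an instance, which may suggest the file being run differs from the code shown. A minimal illustration, unrelated to the Steam API:

class Broken:
    def get_access_token():          # no self parameter
        return "token"

class Fixed:
    def get_access_token(self):      # instance is passed automatically
        return "token"

Fixed().get_access_token()           # fine: equivalent to Fixed.get_access_token(instance)
Broken().get_access_token()          # TypeError: get_access_token() takes 0 positional
                                     # arguments but 1 was given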
Here is my code:
import urllib
import webbrowser
from bs4 import BeautifulSoup
import requests
import re

address = 'https://google.com/search?q='
# Default Google search address start
file = open( "OCR.txt", "rt" )
# Open text document that contains the question
word = file.read()
file.close()

myList = [item for item in word.split('\n')]
newString = ' '.join(myList)
# The question is on multiple lines so this joins them together with proper spacing

qstr = urllib.parse.quote_plus(newString)
# Encode the string

newWord = address + qstr
# Combine the base and the encoded query

response = requests.get(newWord)

#with open('output.html', 'wb') as f:
#    f.write(response.content)
#webbrowser.open('output.html')

answers = open("ocr2.txt", "rt")
ansTable = answers.read()
answers.close()

ans = ansTable.splitlines()

ans1 = str(ans[0])
ans2 = str(ans[2])
ans3 = str(ans[4])

ans1Score = 0
ans2Score = 0
ans3Score = 0

links = []

soup = BeautifulSoup(response.text, 'lxml')
for r in soup.find_all(class_='r'):
    linkRaw = str(r)
    link = re.search("(?P<url>https?://[^\s]+)", linkRaw).group("url")
    if '&' in link:
        finalLink = link.split('&')
        link = str(finalLink[0])
    links.append(link)

#print(links)
#print(' ')

for g in soup.find_all(class_='g'):
    webBlock = str(g)
    ans1Tally = webBlock.count(ans1)
    ans2Tally = webBlock.count(ans2)
    ans3Tally = webBlock.count(ans3)

    if ans1 in webBlock:
        ans1Score += ans1Tally
    else:
        ans1Found = False

    if ans2 in webBlock:
        ans2Score += ans2Tally
    else:
        ans2Found = False

    if ans3 in webBlock:
        ans3Score += ans3Tally
    else:
        ans3Found = False

    if ans1Found and ans2Found and ans3Found is False:
        searchLink = str(links[0])
        if searchLink.endswith('pdf'):
            pass
        else:
            response2 = requests.get(searchLink)
            soup2 = BeautifulSoup(response2.text, 'lxml')
            for p in soup2.find_all('p'):
                extraBlock = str(p)
                extraAns1Tally = extraBlock.count(ans1)
                extraAns2tally = extraBlock.count(ans2)
                extraAns3Tally = extraBlock.count(ans3)
                if ans1 in extraBlock:
                    ans1Score += extraAns1Tally
                if ans2 in extraBlock:
                    ans2Score += extraAns2Tally
                if ans3 in extraBlock:
                    ans3Score += extraAns3Tally

    with open("Results.txt", "w") as results:
        results.write(newString + '\n\n')
        results.write(ans1+": "+str(ans1Score)+'\n')
        results.write(ans2+": "+str(ans2Score)+'\n')
        results.write(ans3+": "+str(ans3Score))

    links.pop(0)

    print(' ')
    print('-----')
    print(ans1+": "+str(ans1Score))
    print(ans2+": "+str(ans2Score))
    print(ans3+": "+str(ans3Score))
    print('-----')
Basically, right now it scrapes each "g" one at a time, when this program could benefit massively from scraping all the links at the same time. For example, I want them all scraping simultaneously instead of each one waiting until the previous one is done. Sorry if this is a simple question, but I have little experience with asyncio, so if anyone could help that would be massively appreciated. Thanks!
To write an async program you need to:
- define functions with async def
- call them with await
- create an event loop and run some function in it
- run requests concurrently using asyncio.gather
Everything else is almost the same as usual. Instead of the blocking requests module you should use an async one, for example aiohttp:
python -m pip install aiohttp
And use it like this:
async def get(url):
    async with aiohttp.ClientSession() as session:
        # request the url that is passed in
        async with session.get(url) as resp:
            return await resp.text()
Here's your code with some changes to get you started. I didn't check whether it actually works, since I don't have the files you use. You should also move the logic inside for g in soup.find_all(class_='g'): into a separate function and run multiple of those functions with asyncio.gather to get the real benefit of asyncio.
import asyncio
import aiohttp
import urllib
import webbrowser
from bs4 import BeautifulSoup
import re


async def get(url):
    async with aiohttp.ClientSession() as session:
        # request the url that is passed in
        async with session.get(url) as resp:
            return await resp.text()


async def main():
    address = 'https://google.com/search?q='
    # Default Google search address start
    file = open( "OCR.txt", "rt" )
    # Open text document that contains the question
    word = file.read()
    file.close()

    myList = [item for item in word.split('\n')]
    newString = ' '.join(myList)
    # The question is on multiple lines so this joins them together with proper spacing

    qstr = urllib.parse.quote_plus(newString)
    # Encode the string

    newWord = address + qstr
    # Combine the base and the encoded query

    text = await get(newWord)

    #with open('output.html', 'wb') as f:
    #    f.write(response.content)
    #webbrowser.open('output.html')

    answers = open("ocr2.txt", "rt")
    ansTable = answers.read()
    answers.close()

    ans = ansTable.splitlines()

    ans1 = str(ans[0])
    ans2 = str(ans[2])
    ans3 = str(ans[4])

    ans1Score = 0
    ans2Score = 0
    ans3Score = 0

    links = []

    soup = BeautifulSoup(text, 'lxml')
    for r in soup.find_all(class_='r'):
        linkRaw = str(r)
        link = re.search("(?P<url>https?://[^\s]+)", linkRaw).group("url")
        if '&' in link:
            finalLink = link.split('&')
            link = str(finalLink[0])
        links.append(link)

    #print(links)
    #print(' ')

    for g in soup.find_all(class_='g'):
        webBlock = str(g)
        ans1Tally = webBlock.count(ans1)
        ans2Tally = webBlock.count(ans2)
        ans3Tally = webBlock.count(ans3)

        if ans1 in webBlock:
            ans1Score += ans1Tally
        else:
            ans1Found = False

        if ans2 in webBlock:
            ans2Score += ans2Tally
        else:
            ans2Found = False

        if ans3 in webBlock:
            ans3Score += ans3Tally
        else:
            ans3Found = False

        if ans1Found and ans2Found and ans3Found is False:
            searchLink = str(links[0])
            if searchLink.endswith('pdf'):
                pass
            else:
                text2 = await get(searchLink)
                soup2 = BeautifulSoup(text2, 'lxml')
                for p in soup2.find_all('p'):
                    extraBlock = str(p)
                    extraAns1Tally = extraBlock.count(ans1)
                    extraAns2tally = extraBlock.count(ans2)
                    extraAns3Tally = extraBlock.count(ans3)
                    if ans1 in extraBlock:
                        ans1Score += extraAns1Tally
                    if ans2 in extraBlock:
                        ans2Score += extraAns2Tally
                    if ans3 in extraBlock:
                        ans3Score += extraAns3Tally

        with open("Results.txt", "w") as results:
            results.write(newString + '\n\n')
            results.write(ans1+": "+str(ans1Score)+'\n')
            results.write(ans2+": "+str(ans2Score)+'\n')
            results.write(ans3+": "+str(ans3Score))

        links.pop(0)

        print(' ')
        print('-----')
        print(ans1+": "+str(ans1Score))
        print(ans2+": "+str(ans2Score))
        print(ans3+": "+str(ans3Score))
        print('-----')


if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(main())
    finally:
        loop.run_until_complete(loop.shutdown_asyncgens())
        loop.close()
Upd:
The main idea is to move the logic inside the loop that makes the request into a separate coroutine and pass multiple of these coroutines to asyncio.gather. That will run your requests concurrently.
async def main():
    # Here do everything that comes before the loop.

    coros = [
        process_single_g(g)
        for g
        in soup.find_all(class_='g')
    ]

    # gather() runs multiple tasks concurrently and returns all results together
    results = await asyncio.gather(*coros)

    for res in results:
        ans1Score, ans2Score, ans3Score = res

        print(' ')
        print('-----')
        print(ans1+": "+str(ans1Score))
        print(ans2+": "+str(ans2Score))
        print(ans3+": "+str(ans3Score))
        print('-----')


async def process_single_g(g):
    # Here do everything you currently do inside the loop for a single g.

    text2 = await get(searchLink)

    # ...

    return ans1Score, ans2Score, ans3Score
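To make that pattern concrete, here is a small, self-contained sketch of the same gather idea with placeholder URLs rather than the Google-scraping logic above:

import asyncio
import aiohttp

async def fetch(session, url):
    # each coroutine performs one request; gather() runs them concurrently
    async with session.get(url) as resp:
        return await resp.text()

async def main():
    urls = ["https://example.com", "https://example.org", "https://example.net"]  # placeholders
    async with aiohttp.ClientSession() as session:
        pages = await asyncio.gather(*(fetch(session, url) for url in urls))
    for url, page in zip(urls, pages):
        print(url, "->", len(page), "characters")

asyncio.run(main())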
In the following program, I tried to call self.start_requests() in self.after_login(), but did not succeed. Instead I repeated the contents of self.start_requests() inline, and that works.
My question is: why can't I just directly call self.start_requests()?
__author__ = 'parallels'
import scrapy
from scrapy import Request
from bs4 import BeautifulSoup


def start_requests(usrname, password):
    return Request(url="http://www.heibanke.com/lesson/crawler_ex01/",
                   cookies={'name': usrname, 'password': password}, dont_filter=True)


class heibanke2(scrapy.Spider):
    name = "herbanke2"
    # start_urls = ["http://www.heibanke.com/lesson/crawler_ex01/"]
    password = 4

    def start_requests(self):
        return [Request("http://www.heibanke.com/lesson/crawler_ex01/", callback=self.post_login, dont_filter=True)]

    #FormRequeset
    def post_login(self, response):
        print 'Preparing login'
        print "current password:", str(self.password)
        return [scrapy.FormRequest.from_response(response,
                                                 formdata={
                                                     'username': "JoseLyn",
                                                     'password': str(self.password)
                                                 },
                                                 callback=self.after_login
                                                 )]

    def after_login(self, response):
        print "after_login"
        with open("body" + str(self.password), "wb") as f:
            f.write(response.body)
        soup = BeautifulSoup(response.body, "lxml")
        if "JoseLyn" not in soup.h3.string:
            self.password += 1
            # self.start_requests()
            return [Request("http://www.heibanke.com/lesson/crawler_ex01/", callback=self.post_login, dont_filter=True)]
        else:
            print "password found:", str(self.password)
            print "next mission at:", 'http://www.heibanke.com' + soup.a['href']
Thank you in advance!
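A likely reason, for reference: Scrapy only schedules requests that a callback returns or yields, so calling self.start_requests() and throwing away its return value does nothing. Returning its result should behave the same as the inlined Request, roughly like this sketch:

def after_login(self, response):
    soup = BeautifulSoup(response.body, "lxml")
    if "JoseLyn" not in soup.h3.string:
        self.password += 1
        # hand the Request objects back to the Scrapy engine;
        # a bare self.start_requests() call builds them and discards them
        return self.start_requests()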