I want to scrape millions of log records from a site. so to achieve this I am running this code and it works fine, now I want to control it via Telegram Bot because I'm running it from raspberry pi 24/7 so I modified it but now it throw error There is no current event loop in thread 'Bot:5689047784:dispatcher'.
My Modified code below:
from telegram.ext.updater import Updater
from telegram.update import Update
from telegram.ext.callbackcontext import CallbackContext
from telegram.ext.commandhandler import CommandHandler
from telegram.ext.messagehandler import MessageHandler
from telegram.ext.filters import Filters
from tqdm import tqdm
import asyncio
import aiohttp
import time
import tqdm
import nest_asyncio
from asyncio import ensure_future, events
from asyncio.queues import Queue
from functools import partial
import telegram
nest_asyncio.apply()
telegramtoken = "hideforsecurity"
chatid = "hideforsecurity"
updater = Updater(telegramtoken,use_context=True)
bot = telegram.Bot(token=telegramtoken)
def as_completed_for_async_gen(fs_async_gen, concurrency):
done = Queue()
loop = events.get_event_loop()
todo = set() # +
def _on_completion(f):
todo.remove(f)
done.put_nowait(f)
loop.create_task(_add_next()) # +
async def _wait_for_one():
f = await done.get()
return f.result()
async def _add_next(): # +
try:
f = await fs_async_gen.__anext__()
except StopAsyncIteration:
return
f = ensure_future(f, loop=loop)
f.add_done_callback(_on_completion)
todo.add(f)
for _ in range(concurrency): # +
loop.run_until_complete(_add_next()) # +
while todo: # +
yield _wait_for_one() # +
CONCURRENCY = 50 # +
n = 6400010
q = 6400020
filename = str(n) + "-" + str(q) + ".json"
async def make_async_gen(f, n, q):
async for x in make_numbers(n, q):
yield f(x)
async def fetch():
# example
url = "https://httpbin.org/anything/log/"
async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(limit=CONCURRENCY)) as session:
headers = {
"User-Agent": "okhttp/3.12.1",
}
async_gen = make_async_gen(partial(do_get, session, url), n, q) # +
updater.message.reply_text("[*] Downloading started via bot")
for f in tqdm.tqdm(as_completed_for_async_gen(async_gen, CONCURRENCY), total=q-n):
response = await f
# Do something with response, such as writing to a local file
if response != "[null]":
print(response)
file1 = open(str(filename), "a") # append mode
file1.write(response + "\n")
file1.close()
# ...
#===================================================
#----------------------------------------------------------------
async def make_numbers(numbers, _numbers):
for i in range(numbers, _numbers):
yield i
async def do_get(session, url, x):
headers = {
'Accept-Encoding': 'gzip, deflate',
'User-Agent': 'okhttp/3.12.1',
'Connection': 'close'
}
async with session.get(url + str(x), headers=headers) as response:
data = await response.text()
return data # +
s = time.perf_counter()
def start(update: Update, context: CallbackContext):
update.message.reply_text(
"Hello sir, Welcome to the Bot.Please write\
/help to see the commands available.")
def help(update: Update, context: CallbackContext):
update.message.reply_text("""Available Commands :-
/status - To get the youtube URL
/download - To get the LinkedIn profile URL""")
def status(update: Update, context: CallbackContext):
update.message.reply_text("current status is at" + str(1))
def download(update: Update, context: CallbackContext):
update.message.reply_text("[*] Downloading started via bot")
try:
c1 = str(n)
c2 = str(q)
bot.sendMessage(chat_id=chatid, text="[*] Downloading Started " + c1 +" - "+ c2)
loop = asyncio.get_event_loop()
loop.run_until_complete(fetch())
bot.sendMessage(chat_id=chatid, text="[*] Downloading Complete " + c1 +" - "+ c2)
except Exception as e:
print(e)
bot.sendMessage(chat_id=chatid, text="[*] Downloading Failed " + c1 + " - " + c2 + " << Reason >> " + str(e))
exit()
elapsed = time.perf_counter() - s
update.message.reply_text("[*] Downloading completed via bot")
def unknown(update: Update, context: CallbackContext):
update.message.reply_text(
"Sorry '%s' is not a valid command" % update.message.text)
def unknown_text(update: Update, context: CallbackContext):
update.message.reply_text(
"Sorry I can't recognize you , you said '%s'" % update.message.text)
updater.dispatcher.add_handler(CommandHandler('start', start))
updater.dispatcher.add_handler(CommandHandler('status', status))
updater.dispatcher.add_handler(CommandHandler('help', help))
updater.dispatcher.add_handler(CommandHandler('download', download))
updater.dispatcher.add_handler(MessageHandler(Filters.text, unknown))
updater.dispatcher.add_handler(MessageHandler(
Filters.command, unknown)) # Filters out unknown commands
# Filters out unknown messages.
updater.dispatcher.add_handler(MessageHandler(Filters.text, unknown_text))
updater.start_polling()
Please help & guide me if this approach of controlling program execution on raspberry is correct or Suggest me some better way.
Related
I have a discord bot written in python using the discord.py library and want to combine it with a basic selfwritten IRC client. My idea was to use the discord bot to control the IRC client (join and part channels) and run them both simultaneously.
discordbot.py:
import time
import configparser
import datetime as dt
import os
from typing import (
Any,
Optional,
Dict,
List
)
import discord
from discord.ext import commands
from irc import IRCSimpleClient
root_path = os.path.dirname(__file__)
config = configparser.ConfigParser()
config.read("config.cfg")
class Main(commands.Bot):
def __init__(self) -> None:
intents = discord.Intents.all()
super().__init__(command_prefix=commands.when_mentioned_or('!'),
intents=intents)
async def on_ready(self):
pass
def watch_message():
while True:
msg = irc.get_response()
msg = ""
if "PING" in msg:
irc.respond_ping(msg)
print(dt.datetime.strftime(dt.datetime.now(), "%H:%M") + " PONG")
try:
msg = msg.strip().split(":")
print("[{}][{}]<{}> {}".format(
dt.datetime.strftime(dt.datetime.now(), "%H:%M"),
"#" + msg[1].split(" #")[1].strip(),
msg[1].split("!")[0],
msg[2].strip()))
except IndexError:
pass
bot = Main()
#bot.command(name="join")
async def test(ctx: commands.Context):
irc.join_channel("test")
username = config["Auth"]["username"]
oauth = config["Auth"]["oauth"]
irc = IRCSimpleClient(username, oauth)
irc.connect()
irc.join_channel("lcs")
# watch_message()
token = config["discord"]["token"]
bot.run(token)
irc.py:
#!/usr/bin/env python
import socket
import time
class IRCSimpleClient():
def __init__(self, nick, oauth):
self.username = nick
self.oauth = oauth
self.server = "irc.chat.twitch.tv"
self.port = 80
def connect(self):
self.conn = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
self.conn.connect((self.server, self.port))
self.conn.send(f"PASS oauth:{self.oauth}\r\n".encode("utf-8"))
self.conn.send(f"NICK {self.username}\r\n".encode("utf-8"))
while not self.is_connected:
resp = self.get_response()
print(resp.strip())
if "376" in resp:
self.is_connected = True
if "PING" in resp:
self.respond_ping(resp)
def get_response(self):
return self.conn.recv(1024).decode("utf-8", "ignore")
def send_cmd(self, cmd, message):
command = "{} {}\r\n".format(cmd, message).encode("utf-8")
self.conn.send(command)
def send_message_to_channel(self, channel, message):
command = "PRIVMSG {}".format(channel)
message = ":" + message
self.send_cmd(command, message)
def join_channel(self, channel: str):
joined = False
cmd = "JOIN"
if not channel.startswith("#"):
channel = "#" + channel
self.send_cmd(cmd, channel)
while not joined:
resp = self.get_response()
print(resp.strip())
if "366" in resp:
joined = True
if "PING" in resp:
self.respond_ping(resp)
def part_channel(self, channel: str):
cmd = "PART"
if not channel.startswith("#"):
channel = "#" + channel
self.send_cmd(cmd, channel)
def respond_ping(self, message):
self.send_cmd("PONG", ":" + message.split(":")[1])
As far as I know, discord.py uses asyncio under the hood so I wanted to use it as well but since the IRC client is blocking when waiting to receive new messages, I'm not sure how to run both at the same time.
I tried asyncio and threading but the "watch_message" function is always blocking the discord bot run function from executing.
I'm just starting to discover how to build a bot with python. I'm trying to send a message at certain time. I read a lot of example, I read the documentation regarding modul_schedule function but I can't fix this issue...
import config
import telebot
import requests
import schedule
import time
from my_parser import parse
from bs4 import BeautifulSoup as BS
bot = telebot.TeleBot(config.token)
r = requests.get('https://example')
html = BS(r.content, 'html.parser')
for el in html.select('#content'):
t_min = el.select('.temperature .min')[0].text
t_max = el.select('.temperature .max')[0].text
min_text = el.select('.wDescription .description')[0].text
t_test = el.select('.wDescription .description')[0].text
response = requests.get(url='https://example')
data = response.json()
btc_price = f"B: {round(data.get('btc_usd').get('last'), 2)}$"
#bot.message_handler(commands=['start', 'help'])
def main(message):
bot.send_message(
message.chat.id, t_min + ', ' + t_max + '\n' + min_text + '\n' + parse() + '\n' + btc_price)
if __name__ == '__main__':
bot.polling(none_stop=True, interval=0)
schedule.every(1).seconds.do(main)
while True:
schedule.run_pending()
time.sleep(1)
I would like the bot send message every morning with temperature on to a channel. I did not find any clues on how to use the function correctly.
I use this library.
Example of my code.
import aioschedule as schedule
async def some_fun():
pass
async def scheduler():
schedule.every().day.at("09:00").do(some_fun())
while True:
await schedule.run_pending()
await asyncio.sleep(2)
if __name__ == "__main__":
loop = asyncio.get_event_loop()
loop.create_task(scheduler())
main()
Below is the python code (program.py) and the requirements file (requirements.txt).
Function async def get_title_range() is not working properly, it generates the following error code:
httpx.HTTPStatusError: Redirect response '301 Moved Permanently' for
url 'https://talkpython.fm/episodes/show/271' Redirect location:
'https://talkpython.fm/episodes/show/271/unlock-the-mysteries-of-time-pythons-datetime-that-is'
For more information check: https://httpstatuses.com/301
Python code, based on python 3.9 (program.py):
import asyncio
import datetime
import httpx
import bs4
from colorama import Fore
global loop
async def get_html(episode_number: int) -> str:
print(Fore.YELLOW + f"Getting HTML for episode {episode_number}", flush=True)
url = f"https://talkpython.fm/episodes/show/{episode_number}"
async with httpx.AsyncClient() as client:
resp = await client.get(url)
resp.raise_for_status()
return resp.text
def get_title(html: str, episode_number: int) -> str:
print(Fore.CYAN + f"Getting TITLE for episode {episode_number}", flush=True)
soup = bs4.BeautifulSoup(html, 'html.parser')
header = soup.select_one('h1')
if not header:
return "MISSING"
return header.text.strip()
def main():
t0 = datetime.datetime.now()
global loop
loop = asyncio.get_event_loop()
loop.run_until_complete(get_title_range())
dt = datetime.datetime.now() - t0
print(f"Done in {dt.total_seconds():.2f} sec.")
async def get_title_range()
tasks = []
for n in range(270, 280):
tasks.append((n, loop.create_task(get_html(n))))
for n, t in tasks:
html = await t
title = get_title(html, n)
print(Fore.WHITE + f"Title found: {title}", flush=True)
if __name__ == '__main__':
main()
The requirements (requitements.txt):
bs4
colorama
httpx
Here is my code using request-html ASyncHtmlSession in Fast api
#app.get('/')
async def ScrapeData(pages:Optional[int]= 1):
crawle = Crawler()
for page in range(1,pages+1):
url = f"url here"
asession = AsyncHTMLSession()
r = await asession.get(url)
await r.html.arender(sleep=1)
widget = r.html.xpath('//*[#id="widgetContent"]')[0]
items = widget.find('div')
crawle.GetData(items)
return crawle.data
You need to explicitly enable redirects in httpx (unlike in requests). From their docs:
Unlike requests, HTTPX does not follow redirects by default.
We differ in behaviour here because auto-redirects can easily mask unnecessary network calls being made.
You can still enable behaviour to automatically follow redirects, but you need to do so explicitly...
response = client.get(url, follow_redirects=True)
Or else instantiate a client, with redirect following enabled by default...
client = httpx.Client(follow_redirects=True)
I'm creating an optimized multi-threading app using asyncio and want to add a rotating proxy into the mix.
Starting with a sample taken from this outstanding article:
Speed Up Your Python Program With Concurrency
I added a rotating proxy and it stopped working. The code simply exits the function after touching the line for the proxy.
This little snippet of code works, but not when added to the main script as shown in the screenshot above.
import asyncio
import random as rnd
async def download_site():
proxy_list = [
('38.39.205.220:80'),
('38.39.204.100:80'),
('38.39.204.101:80'),
('38.39.204.94:80')
]
await asyncio.sleep(1)
proxy = rnd.choice(proxy_list)
print(proxy)
asyncio.run(download_site())
And here's the full sample:
import asyncio
import time
import aiohttp
# Sample code taken from here:
# https://realpython.com/python-concurrency/#asyncio-version
# Info for adding headers for the proxy (Scroll toward the bottom)
# https://docs.aiohttp.org/en/stable/client_advanced.html
# Good read to possible improve performance on large lists of URLs
# https://asyncio.readthedocs.io/en/latest/webscraper.html
# RUN THIS METHOD TO SEE HOW IT WORKS.
# # Original Code (working...)
# async def download_site(session, url):
# async with session.get(url, proxy="http://proxy.com") as response:
# print("Read {0} from {1}".format(response.content_length, url))
def get_proxy(self):
proxy_list = [
(754, '38.39.205.220:80'),
(681, '38.39.204.100:80'),
(682, '38.39.204.101:80'),
(678, '38.39.204.94:80')
]
proxy = random.choice(proxy_list)
print(proxy[1])
return proxy
async def download_site(session, url):
proxy_list = [
('38.39.205.220:80'),
('38.39.204.100:80'),
('38.39.204.101:80'),
('38.39.204.94:80')
]
await asyncio.sleep(1)
proxy = rnd.choice(proxy_list)
print(proxy)
async with session.get(url, proxy="http://" + proxy) as response:
print("Read {0} from {1}".format(response.content_length, url))
async def download_all_sites(sites):
async with aiohttp.ClientSession() as session:
tasks = []
for url in sites:
task = asyncio.ensure_future(download_site(session, url))
tasks.append(task)
await asyncio.gather(*tasks, return_exceptions=True)
# Modified to loop thru only 1 URL to make debugging simple
if __name__ == "__main__":
sites = [
"https://www.jython.org",
# "http://olympus.realpython.org/dice",
] #* 80
start_time = time.time()
asyncio.get_event_loop().run_until_complete(download_all_sites(sites))
duration = time.time() - start_time
print(f"Downloaded {len(sites)} sites in {duration} seconds")
Thank you for any help you can offer.
You use return_exceptions=True but you don't actually check the returned results for errors. You can use asyncio.as_completed to handle exceptions and get the earliest next result:
import asyncio
import random
import traceback
import aiohttp
URLS = ("https://stackoverflow.com",)
TIMEOUT = 5
PROXIES = (
"http://38.39.205.220:80",
"http://38.39.204.100:80",
"http://38.39.204.101:80",
"http://38.39.204.94:80",
)
def get_proxy():
return random.choice(PROXIES)
async def download_site(session, url):
proxy = get_proxy()
print(f"Got proxy: {proxy}")
async with session.get(url, proxy=f"{proxy}", timeout=TIMEOUT) as resp:
print(f"{url}: {resp.status}")
return await resp.text()
async def main():
tasks = []
async with aiohttp.ClientSession() as session:
for url in URLS:
tasks.append(asyncio.create_task(download_site(session, url)))
for coro in asyncio.as_completed(tasks):
try:
html = await coro
except Exception:
traceback.print_exc()
else:
print(len(html))
if __name__ == "__main__":
asyncio.run(main())
I am writing a simple producer/consumer app to call multiple URL's asynchronously.
In the following code if I set the conn_count=1, and add 2 items to the Queue it works fine as only one consumer is created. But if I make conn_count=2 and add 4 items to the Queue only 3 request are being made. The other request fails with ClientConnectorError.
Can you please help be debug the reason for failure with multiple consumers? Thank You.
I am using a echo server I created.
Server:
import os
import logging.config
import yaml
from aiohttp import web
import json
def start():
setup_logging()
app = web.Application()
app.router.add_get('/', do_get)
app.router.add_post('/', do_post)
web.run_app(app)
async def do_get(request):
return web.Response(text='hello')
async def do_post(request):
data = await request.json()
return web.Response(text=json.dumps(data))
def setup_logging(
default_path='logging.yaml',
default_level=logging.INFO,
env_key='LOG_CFG'
):
path = default_path
value = os.getenv(env_key, None)
if value:
path = value
if os.path.exists(path):
with open(path, 'rt') as f:
config = yaml.safe_load(f.read())
logging.config.dictConfig(config)
else:
logging.basicConfig(level=default_level)
if __name__ == '__main__':
start()
Client:
import asyncio
import collections
import json
import sys
import async_timeout
from aiohttp import ClientSession, TCPConnector
MAX_CONNECTIONS = 100
URL = 'http://localhost:8080'
InventoryAccount = collections.namedtuple("InventoryAccount", "op_co customer_id")
async def produce(queue, num_consumers):
for i in range(num_consumers * 2):
await queue.put(InventoryAccount(op_co=i, customer_id=i * 100))
for j in range(num_consumers):
await queue.put(None)
async def consumer(n, queue, session, responses):
print('consumer {}: starting'.format(n))
while True:
try:
account = await queue.get()
if account is None:
queue.task_done()
break
else:
print(f"Consumer {n}, Updating cloud prices for account: opCo = {account.op_co!s}, customerId = {account.customer_id!s}")
params = {'opCo': account.op_co, 'customerId': account.customer_id}
headers = {'content-type': 'application/json'}
with async_timeout.timeout(10):
print(f"Consumer {n}, session state " + str(session.closed))
async with session.post(URL,
headers=headers,
data=json.dumps(params)) as response:
assert response.status == 200
responses.append(await response.text())
queue.task_done()
except:
e = sys.exc_info()[0]
print(f"Consumer {n}, Error updating cloud prices for account: opCo = {account.op_co!s}, customerId = {account.customer_id!s}. {e}")
queue.task_done()
print('consumer {}: ending'.format(n))
async def start(loop, session, num_consumers):
queue = asyncio.Queue(maxsize=num_consumers)
responses = []
consumers = [asyncio.ensure_future(loop=loop, coro_or_future=consumer(i, queue, session, responses)) for i in range(num_consumers)]
await produce(queue, num_consumers)
await queue.join()
for consumer_future in consumers:
consumer_future.cancel()
return responses
async def run(loop, conn_count):
async with ClientSession(loop=loop, connector=TCPConnector(verify_ssl=False, limit=conn_count)) as session:
result = await start(loop, session, conn_count)
print("Result: " + str(result))
if __name__ == '__main__':
conn_count = 2
loop = asyncio.get_event_loop()
try:
loop.run_until_complete(run(loop, conn_count))
finally:
loop.close()
Reference:
https://pymotw.com/3/asyncio/synchronization.html
https://pawelmhm.github.io/asyncio/python/aiohttp/2016/04/22/asyncio-aiohttp.html
https://hackernoon.com/asyncio-for-the-working-python-developer-5c468e6e2e8e