How can I check whether a thread has completed in Python? I have this code:
async def transcribe():
    """Send the audio file named by the global ``filename`` to Deepgram.

    The raw API response is stored in the global ``response`` and printed,
    followed by the transcript of the first alternative of the first channel.
    As a side effect the global ``PATH_TO_FILE`` is set to ``filename``.
    """
    global response, filename, PATH_TO_FILE

    # Initializes the Deepgram SDK
    dg_client = Deepgram(DEEPGRAM_API_KEY)
    PATH_TO_FILE = filename

    # The open file handle itself is handed to the SDK as the request buffer,
    # so the request is awaited while the file is still open.
    with open(filename, 'rb') as audio_file:
        payload = {'buffer': audio_file, 'mimetype': MIMETYPE}
        request_options = {'punctuate': True, 'language': 'en-US'}
        notices = (
            'Requesting transcript...',
            'Your file may take up to a couple minutes to process.',
            'While you wait, did you know that Deepgram accepts over 40 audio file formats? Even MP4s.',
            'To learn more about customizing your transcripts check out developers.deepgram.com',
        )
        for notice in notices:
            print(notice)
        response = await dg_client.transcription.prerecorded(payload, request_options)
        print(response)
        first_alternative = response['results']['channels'][0]['alternatives'][0]
        print(first_alternative['transcript'])
def runTranscribe():
    """Run the ``transcribe`` coroutine to completion on its own event loop."""
    asyncio.run(transcribe())


# Worker thread for the transcription; it is only created here, not started.
thread = threading.Thread(target=runTranscribe)
I found out about the is_alive() method, but it tells me whether the thread is alive, not whether it has finished. It would be great if someone could help me. I'm using Python 3.10. Thank you!
# Wait for the worker without a busy loop: join() sleeps until the thread
# ends (or the timeout elapses), whereas polling is_alive() in a tight
# `while True` keeps a CPU core spinning. A thread that was never started
# reports is_alive() == False, so this falls straight through, exactly as
# the polling loop would.
while thread.is_alive():
    thread.join(timeout=1.0)
#do something
Related
So I was writing a python script using telebot and got an error
A request to the Telegram API was unsuccessful. Error code: 400.
Description: Bad Request: file must be non-empty
I have tried different methods from many forums, but nothing helps
import telebot
import random
import time
token = #token here
bot = telebot.TeleBot(token)
# Absolute paths of the video files the bot can send.
shit = ["C:\\Users\\glebc\\Documents\\source(bot)\\3wZ3.gif.mp4", "C:\\Users\\glebc\\Documents\\source(bot)\\65216814_456719028224290_7745639790787166208_n.mp4", "C:\\Users\\glebc\\Documents\\source(bot)\\doc_2022-03-10_16-41-49.mp4", "C:\\Users\\glebc\\Documents\\source(bot)\\doc_2022-03-10_16-42-04.mp4", "C:\\Users\\glebc\\Documents\\source(bot)\\doc_2022-03-10_16-42-39.mp4", "C:\\Users\\glebc\\Documents\\source(bot)\\giphy.mp4", "C:\\Users\\glebc\\Documents\\source(bot)\\IMG_0080.mp4", "C:\\Users\\glebc\\Documents\\source(bot)\\IMG_0835.mp4", "C:\\Users\\glebc\\Documents\\source(bot)\\IMG_1362.mp4", "C:\\Users\\glebc\\Documents\\source(bot)\\IMG_4698.mp4", "C:\\Users\\glebc\\Documents\\source(bot)\\IMG_4962.mp4", "C:\\Users\\glebc\\Documents\\source(bot)\\IMG_6359.mp4", "C:\\Users\\glebc\\Documents\\source(bot)\\IMG_7497.MOV", "C:\\Users\\glebc\\Documents\\source(bot)\\IMG_7909.MOV", "C:\\Users\\glebc\\Documents\\source(bot)\\IMG_9540.mp4", "C:\\Users\\glebc\\Documents\\source(bot)\\mp4.mp4", "C:\\Users\\glebc\\Documents\\source(bot)\\video.mp4", "C:\\Users\\glebc\\Documents\\source(bot)\\комочек тьмы.mp4", "C:\\Users\\glebc\\Documents\\source(bot)\\кот.mp4"]
# One random file is picked and opened a single time, when the script starts;
# the handle is never closed or rewound afterwards.
video = open(shit[random.randint(0, len(shit)-1)], 'rb')
# NOTE(review): the "#bot.message_handler(...)" lines look like decorators
# whose leading "@" was lost when the snippet was extracted - confirm.
#bot.message_handler(commands=['start'])
def start_message(message):
    """Reply to the chat the message came from with a fixed greeting."""
    bot.send_message(message.chat.id, 'hello message 1')
#bot.message_handler(commands=['haha'])
def haha_message(message):
    """Send a video to the chat, then sleep one hour, in an endless loop."""
    while True:
        # NOTE(review): `vidos` is not defined anywhere in this snippet; the
        # file handle opened above is named `video`.
        bot.send_video(message.chat.id, vidos)
        time.sleep(3600) #1 hour
#bot.message_handler(commands=['hehe'])
def shit_message(message):
    """Send a single video to the chat the message came from."""
    # NOTE(review): same undefined name `vidos` as in haha_message.
    bot.send_video(message.chat.id, vidos)
bot.infinity_polling()
Also, I don't understand the error, because I never close the file; I only open it.
The problem may be that you open the file only once and never close it and open it again.
When a file is read, a special pointer moves to show where the next read should start. Once the whole file has been read, this pointer sits at the end of the file, so the next read starts from the end, finds nothing to read, and the API may report that the file is empty.
After reading, you may have to use video.seek(0) to move the pointer back to the beginning of the file.
Or you can close the file and open it again. This is even more useful, because at the moment you select a random file only once and would always use the same path afterwards. You should call random inside the loop.
# The leading "@" is restored: as a plain comment the handler was never
# registered with the bot.
@bot.message_handler(commands=['haha'])
def haha_message(message):
    """Send a randomly chosen video to the chat once an hour, forever.

    Args:
        message: the incoming Telegram message; its ``chat.id`` is the target.

    Note:
        The loop never returns, so it occupies the thread that runs it.
    """
    while True:
        # A new random file is opened on every iteration, so each upload
        # starts reading at offset 0. The with-block closes the handle even
        # if send_video raises.
        with open(random.choice(shit), 'rb') as video:
            bot.send_video(message.chat.id, video)
        time.sleep(3600)  # 1 hour
and the same in other functions
# The leading "@" is restored: as a plain comment the handler was never
# registered with the bot.
@bot.message_handler(commands=['hehe'])
def shit_message(message):
    """Send one randomly chosen video to the chat the message came from.

    Args:
        message: the incoming Telegram message; its ``chat.id`` is the target.
    """
    # The with-block closes the file even if send_video raises.
    with open(random.choice(shit), 'rb') as video:
        bot.send_video(message.chat.id, video)
BTW:
Telegram libraries may have methods for executing tasks periodically.
For example module python-telegram-bot has telegram.ext.jobqueue for this.
Full working code
For tests I set logging.DEBUG to see all error messages.
Normally telebot catches all errors and hides them.
I also used with open() as video so it automatically closes file.
import os
import random
import logging
import telebot
# Make telebot log everything, so errors it would otherwise hide are visible.
telebot.logger.setLevel(logging.DEBUG)

# The bot token is read from the environment (None when the variable is unset).
TOKEN = os.environ.get('TELEGRAM_TOKEN')
bot = telebot.TeleBot(TOKEN)
# Every clip lives in the same folder, so the folder is spelled out once and
# joined to each file name with a literal backslash (not os.path.join, which
# would pick the separator of the machine running the script).
_VIDEO_DIR = "C:\\Users\\glebc\\Documents\\source(bot)"
_VIDEO_NAMES = (
    "3wZ3.gif.mp4",
    "65216814_456719028224290_7745639790787166208_n.mp4",
    "doc_2022-03-10_16-41-49.mp4",
    "doc_2022-03-10_16-42-04.mp4",
    "doc_2022-03-10_16-42-39.mp4",
    "giphy.mp4",
    "IMG_0080.mp4",
    "IMG_0835.mp4",
    "IMG_1362.mp4",
    "IMG_4698.mp4",
    "IMG_4962.mp4",
    "IMG_6359.mp4",
    "IMG_7497.MOV",
    "IMG_7909.MOV",
    "IMG_9540.mp4",
    "mp4.mp4",
    "video.mp4",
    "комочек тьмы.mp4",
    "кот.mp4",
)
# Absolute paths of all videos the bot may send.
all_videos = [_VIDEO_DIR + "\\" + video_name for video_name in _VIDEO_NAMES]
# The leading "@" is restored: as a plain comment the handler was never
# registered with the bot.
@bot.message_handler(commands=['start'])
def start_message(message):
    """Reply to the chat the message came from with a fixed greeting.

    Args:
        message: the incoming Telegram message; its ``chat.id`` is the target.
    """
    bot.send_message(message.chat.id, 'hello message 1')
# The leading "@" is restored: as a plain comment the handler was never
# registered with the bot.
@bot.message_handler(commands=['haha'])
def haha_message(message):
    """Send a randomly chosen video to the chat once an hour, forever.

    Args:
        message: the incoming Telegram message; its ``chat.id`` is the target.

    Note:
        The loop never returns, so it occupies the thread that runs it.
    """
    # Imported here because the script's import list does not include `time`,
    # which would make the sleep below fail with NameError.
    import time

    while True:
        # A new random file is opened per iteration and closed by the
        # with-block, so every upload reads from the start of the file.
        with open(random.choice(all_videos), 'rb') as video:
            bot.send_video(message.chat.id, video)
        time.sleep(3600)  # 1 hour
# The leading "@" is restored: as a plain comment the handler was never
# registered with the bot.
@bot.message_handler(commands=['hehe'])
def shit_message(message):
    """Send one randomly chosen video to the chat the message came from.

    Args:
        message: the incoming Telegram message; its ``chat.id`` is the target.
    """
    with open(random.choice(all_videos), 'rb') as video:
        bot.send_video(message.chat.id, video)
bot.infinity_polling()
I'm trying to use the catch_up() function to get all file updates on boot. However, every time I run my code, only half of the file is downloaded, and sometimes the file is completely empty.
However, when I try to run with "iter_messages" I manage to download everything perfectly.
HELP!?
# NOTE(review): this line looks like the decorator "@client.on(events.NewMessage)"
# whose leading "@" was lost when the snippet was extracted - confirm.
#client.on(events.NewMessage)
async def new_messages(event):
    """Store an incoming message, and any attached file, in the database.

    Downloads a document or photo attached to the event, then inserts one row
    into ``tabela_de_mensagens`` and commits.

    NOTE(review): the original indentation was lost. The nesting below (all
    processing inside the non-channel branch) is a reconstruction, chosen
    because ``dialog`` is only assigned in that branch - confirm.
    """
    if hasattr(event.message.peer_id, "channel_id"):
        print("Um dos canais");
    else:
        # `dialog` is the chat id for group chats, otherwise the peer's user id.
        if hasattr(event.message.peer_id, 'chat_id'):
            print("Tipo: ","chat");
            dialog = int(event.message.peer_id.chat_id);
        else:
            print("Tipo: ","conversa");
            dialog = int(event.message.peer_id.user_id)
        '''getting the files'''
        # `path` stays empty unless a document or photo is downloaded below.
        path = ""
        if hasattr(event.media, "document"):
            print("================\n", event.message.id, "\n================");
            path = await client.download_media(event.media, file="arquivos_chimera/");
            print(event)
        if hasattr(event.media, "photo"):
            print("================\n", event.message.id, "\n================");
            path = await client.download_media(event.media, file="imagens_chimera/")
            print(event)
        '''getting the Telegram date'''
        data = str(event.message.date);
        '''text of the message'''
        # NOTE(review): async_ajuste_SQL is defined elsewhere; presumably it
        # escapes the text for use inside the SQL string below - verify.
        temp_message = await async_ajuste_SQL(event.message.message);
        if path != "":
            # Prefix the stored text with the path of the downloaded file.
            temp_message = path + " - " + temp_message;
        '''Quem enviou a mensagem'''
        # Sender: fall back to the peer's user id when from_id is missing.
        if event.message.from_id==None:
            from_ = event.message.peer_id.user_id;
        else:
            from_ = event.message.from_id.user_id
        # NOTE(review): the statement is built by string interpolation of
        # message content; consider a parameterized query if the driver
        # behind `cur` supports one.
        cur.execute(f"insert into tabela_de_mensagens values ({event.message.id}, {dialog}, {from_}, '{data}', '{temp_message}', 0);");
        con.commit();
async def main():
    """Fetch the updates missed while offline, then keep the client running.

    ``catch_up()`` only triggers the pending updates. The event handlers that
    process them (including media downloads) need the client to stay
    connected while they run. Returning right after ``catch_up()`` cuts them
    off, which leaves partially written files.
    """
    await client.catch_up()
    # Keep-alive: block here until the client disconnects, so handlers
    # started by the caught-up updates can finish.
    await client.run_until_disconnected()
NOTE: the problem only occurs with images; delete, edit and new-message updates come through perfectly.
So, after some testing, I realized that the problem was the fact that I was using an event handler without using a keep alive function, i.e., the event handler only works while the main function works, so, if you try to run the event handler with catch_up alone, it will only get the first updates, but will stop shortly after that (hence, why my image files were created, but not completed).
To get a solution, you can look at the following links:
https://github.com/LonamiWebs/Telethon/issues/1534
https://github.com/LonamiWebs/Telethon/issues/3146
https://docs.python.org/3.8/library/asyncio-task.html#asyncio.wait
I am downloading a lot of files from a website and want the downloads to run in parallel because the files are heavy. Unfortunately I can't share the website, because accessing the files requires a username and password which I can't share. Below is my code. I know it can't be run without the website and my credentials, but I am 99% sure I am not allowed to share that information.
import os
import requests
from multiprocessing import Process
dataset="dataset_name"
################################
def down_file(dspath, file, savepath, ret):
    """Download one remote file into ``savepath``, streaming it in chunks.

    Args:
        dspath: base web path; ``file`` is appended to it to form the URL.
        file: remote file path; its base name is used as the local file name.
        savepath: existing local directory that receives the file.
        ret: response of the login request; its cookies authenticate the
            download.

    Returns:
        None.

    Raises:
        requests.HTTPError: if the server answers with an error status, so an
            error page is never saved as if it were the data file.
    """
    webfilename = dspath + file
    file_base = os.path.basename(file)
    # os.path.join is used explicitly: the visible imports provide `os` but
    # no bare `join`.
    target = os.path.join(savepath, file_base)
    print('...Downloading', file_base)
    chunk_size = 1048576  # 1 MiB per write keeps memory use flat
    # The with-block releases the connection even if writing fails. The
    # unused Content-length lookup was dropped: the header is absent on
    # chunked responses and the lookup would then raise KeyError.
    with requests.get(webfilename, cookies=ret.cookies, allow_redirects=True, stream=True) as req:
        req.raise_for_status()
        with open(target, 'wb') as outfile:
            for chunk in req.iter_content(chunk_size=chunk_size):
                outfile.write(chunk)
    return None
################################
##Download files
def download_files(filelist, c_DateNow, max_workers=None):
    """Log in once, then download every file in ``filelist`` concurrently.

    Downloads are I/O-bound, so a thread pool is used instead of one process
    per file. Threads need no ``if __name__ == '__main__':`` guard in the
    launching script, so this works when the module is imported, and the
    pool bounds how many downloads run at the same time.

    Args:
        filelist: remote file paths, each passed to ``down_file``.
        c_DateNow: name of the sub-directory (under the dataset directory)
            that receives the files.
        max_workers: maximum number of simultaneous downloads; ``None`` lets
            ``ThreadPoolExecutor`` pick its default.

    Returns:
        None. A failed download is reported on stdout and does not stop the
        remaining ones.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed

    ## Authenticate
    url = 'url'
    values = {'email' : 'email', 'passwd' : "password", 'action' : 'login'}
    ret = requests.post(url, data=values)
    ## Path to files
    dspath = 'datasetwebpath'
    savepath = os.path.join(path_script, dataset, c_DateNow)
    os.makedirs(savepath, exist_ok=True)

    print(["dspath, %s, savepath, ret\n"%(file) for file in filelist])
    # Leaving the with-block waits for every submitted download to finish.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {
            pool.submit(down_file, dspath, file, savepath, ret): file
            for file in filelist
        }
        for future in as_completed(pending):
            error = future.exception()
            if error is not None:
                print('...Failed', pending[future], error)
    # Sequential equivalent, kept for reference:
    #     for file in filelist:
    #         down_file(dspath, file, savepath, ret)
################################
def main(c_DateNow, c_DateIni, c_DateFin):
    """Announce the files to fetch and hand them to ``download_files``.

    ``c_DateIni`` and ``c_DateFin`` are accepted but not used by the code
    shown here.
    """
    ## Other code
    files = ["list of web file addresses"]
    listing = "\n ".join(files)
    print(" ...Files being downladed\n ", listing, "\n")
    ## Download files
    download_files(files, c_DateNow)
I want to download 25 files. When I run the code all the print lines that have been printed before in the code are being reprinted even though the Process execution is not even near them. I am also getting the following error constantly
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.
This probably means that you are not using fork to start your
child processes and you have forgotten to use the proper idiom
in the main module:
I googled the error and don't know how to fix it. Does it have to do with there not being enough cores? Is there a way to stop the Process depending on how many cores I have available? Or is it something else entirely?
In a question here, I read that the Process has to be within the __main__ function but this code is a module that gets imported in another code so when I run it I run it as
import this_code
import another1_code
import another2_code
if __name__ == '__main__':
    # With multiprocessing's "spawn" start method (the default on Windows and
    # macOS) every child process re-imports this launcher script. Without
    # this guard each child would re-run all the steps below, repeating
    # their prints and trying to start processes again, which is what the
    # "bootstrapping phase" error reports.
    #Step1
    another1_code.main()
    #Step2
    c_DateNow, c_DateIni, c_DateFin = another2_code.main()
    #Step3
    this_code.main(c_DateNow, c_DateIni, c_DateFin)
    #step4
    ## More code
So I need the process to be within a function and not in __main__
I appreciate any help or suggestions on how to correctly parallelize the above code in a way that allows me to use it as a module in another code.
EDIT: I think I've figured out a solution using subprocess.Popen with separate .py files for each stream being monitored. It's not pretty, but it works.
I'm working on a script to monitor a streaming site for several different accounts and to record when they are online. I am using the livestreamer package for downloading a stream when it comes online, but the problem is that the program will only record one stream at a time. I have the program loop through a list and if a stream is online, start recording with subprocess.call(["livestreamer"... The problem is that once the program starts recording, it stops going through the loop and doesn't check or record any of the other livestreams. I've tried using Process and Thread, but none of these seem to work. Any ideas?
Code below. Asterisks are not literally part of code.
import os,urllib.request,time,subprocess,datetime,random
status = {
"********":False,
"********":False,
"********":False
}
def gen_name(tag):
return stuff <<Bunch of unimportant code stuff here.
def dl(tag):
    """Start recording the stream for ``tag`` in the background.

    ``subprocess.call`` waits until livestreamer exits, which stalls the
    monitoring loop for as long as the stream is live. ``Popen`` returns
    immediately, so the other streams keep being checked.

    Args:
        tag: account name appended to the site URL.

    Returns:
        The ``subprocess.Popen`` handle of the recorder. Callers may ignore
        it, or ``poll()`` / ``wait()`` on it.
    """
    return subprocess.Popen(["livestreamer","********.com/"+tag,"best","-o",".\\tmp\\"+gen_name(tag)])
def loopCheck():
    """Poll every tracked account forever, recording newly online streams.

    For each tag in ``status`` the account page is fetched. If it contains
    ".m3u8" the stream counts as online, and ``dl`` is called once when the
    tag switches from offline to online. After each full pass the loop
    sleeps for 15 seconds.
    """
    while True:
        for tag in status:
            page = urllib.request.urlopen("http://*******.com/" + tag + "/").read().decode()
            if ".m3u8" not in page:
                print(tag+ " is offline.")
                status[tag] = False
                continue
            print(tag + " is online!")
            # Only start a recording on the offline -> online transition.
            if not status[tag]:
                status[tag] = True
                dl(tag)
        time.sleep(15)
loopCheck()
Recently I am working on a tiny crawler for downloading images on a url.
I use openurl() in urllib2 with f.open()/f.write():
Here is the code snippet:
# the list for the images' urls
imglist = re.findall(regImg,pageHtml)
# iterate to download images; files are numbered from 1
for index, img_url in enumerate(imglist, 1):
    img = urllib2.urlopen(img_url)
    try:
        # the with-block closes the output file even if the read below fails
        with open(r'E:\OK\%s.jpg' % str(index), 'wb') as f:
            print('To Read...')
            # potential timeout, may block for a long time
            # so I wonder whether there is any mechanism to enable retry when time exceeds a certain threshold
            f.write(img.read())
    finally:
        # always release the connection, on success and on error
        img.close()
    print('Image %d is ready !' % index)
In the code above, the img.read() will potentially block for a long time, I hope to do some retry/re-open the image url operation under this issue.
I also concern on the efficient perspective of the code above, if the number of the images to be downloaded is somewhat big, using a thread pool to download them seems to be better.
Any suggestions? Thanks in advance.
p.s. I found the read() method on img object may cause blocking, so adding a timeout parameter to the urlopen() alone seems useless. But I found file object has no timeout version of read(). Any suggestions on this ? Thanks very much .
The urllib2.urlopen has a timeout parameter which is used for all blocking operations (connection buildup etc.)
This snippet is taken from one of my projects. I use a thread pool to download multiple files at once. It uses urllib.urlretrieve but the logic is the same. The url_and_path_list is a list of (url, path) tuples, the num_concurrent is the number of threads to be spawned, and the skip_existing skips downloading of files if they already exist in the filesystem.
def download_urls(url_and_path_list, num_concurrent, skip_existing):
    """Download a batch of URLs with a pool of worker threads.

    Args:
        url_and_path_list: iterable of ``(url, path)`` tuples.
        num_concurrent: number of ``DownloadThread`` workers to start.
        skip_existing: if true, workers skip entries whose path already
            exists.

    Returns:
        None, once every queued entry has been marked done.

    Raises:
        ValueError: if there is work to do but ``num_concurrent`` is less
            than 1. Without workers the final ``queue.join()`` would block
            forever.
    """
    # prepare the queue
    queue = Queue.Queue()
    for url_and_path in url_and_path_list:
        queue.put(url_and_path)

    if num_concurrent < 1 and not queue.empty():
        raise ValueError("num_concurrent must be at least 1 when there are urls to download")

    # start the requested number of download threads to download the files;
    # they are daemons, so they do not keep the interpreter alive afterwards
    for _ in range(num_concurrent):
        t = DownloadThread(queue, skip_existing)
        t.daemon = True
        t.start()

    # blocks until task_done() has been called for every queued entry
    queue.join()
class DownloadThread(threading.Thread):
    """Worker thread that downloads ``(url, path)`` jobs from a shared queue.

    Args:
        queue: queue of ``(url, path)`` tuples shared by all workers.
        skip_existing: if true, jobs whose ``path`` already exists are
            skipped.
    """

    def __init__(self, queue, skip_existing):
        super(DownloadThread, self).__init__()
        self.queue = queue
        self.skip_existing = skip_existing

    def run(self):
        """Process jobs forever; every job taken is always marked done."""
        while True:
            # grabs url from queue
            url, path = self.queue.get()
            try:
                if self.skip_existing and exists(path):
                    # skip if requested
                    continue
                try:
                    urllib.urlretrieve(url, path)
                except IOError:
                    print("Error downloading url '%s'." % url)
            finally:
                # signals to queue job is done. Doing this in `finally`
                # keeps the queue's accounting correct on the skip path and
                # when an unexpected exception escapes, so this job cannot
                # leave queue.join() waiting.
                self.queue.task_done()
When you create the connection with urllib2.urlopen(), you can give a timeout parameter.
As described in the doc :
The optional timeout parameter specifies a timeout in seconds for
blocking operations like the connection attempt (if not specified, the
global default timeout setting will be used). This actually only works
for HTTP, HTTPS and FTP connections.
With this you will be able to manage a maximum waiting duration and catch the exception raised.
The way I crawl a huge batch of documents is by having batch processor which crawls and dumps constant sized chunks.
Suppose you are to crawl a pre-known batch of say 100K documents. You can have some logic to generate constant size chunks of say 1000 documents which would be downloaded by a threadpool. Once the whole chunk is crawled, you can have bulk insert in your database. And then proceed with further 1000 documents and so on.
Advantages you get by following this approach:
You get the advantage of threadpool speeding up your crawl rate.
It's fault-tolerant, in the sense that you can continue from the chunk where it last failed.
You can have chunks generated on the basis of priority i.e. important documents to crawl first. So in case you are unable to complete the whole batch. Important documents are processed and less important documents can be picked up later on the next run.
An ugly hack that seems to work.
import os, socket, threading, errno
def timeout_http_body_read(response, timeout = 60):
    """Read the whole body of ``response``, giving up after ``timeout`` seconds.

    A timer closes the response's OS file descriptor if the read has not
    finished in time, which makes the blocked ``read()`` fail with EBADF.

    Args:
        response: object with ``read()``, ``fileno()`` and ``close()``.
        timeout: seconds to wait before forcing the read to fail.

    Returns:
        ``(True, body)`` on success, ``(False, None)`` if the timer fired.

    Raises:
        socket.error: for any socket error other than EBADF.
    """
    def murha(resp):
        os.close(resp.fileno())
        resp.close()

    # set a timer to yank the carpet underneath the blocking read() by closing the os file descriptor
    t = threading.Timer(timeout, murha, (response,))
    t.start()
    try:
        body = response.read()
    except socket.error as se:
        if se.errno == errno.EBADF: # murha happened
            return (False, None)
        raise
    finally:
        # Always disarm the timer. If it stayed armed after an error it
        # would later close a descriptor number that may by then belong to
        # an unrelated file or socket.
        t.cancel()
    return (True, body)