How to download a large file faster using Telethon in Telegram? - python

I'm trying to write something that downloads a lot of files from a Telegram channel.
The code works, but it takes too long, and on top of that I have a slow internet connection.
I have this code; the files I am downloading weigh about 1 GB each and they take a long time. Is there, for example, a way to make the download faster?
from telethon.sync import TelegramClient
from telethon.tl.functions.messages import GetHistoryRequest
import datetime
import os

# `client` is the TelegramClient created elsewhere in the script
def get_entity_data(entity_id, limit):
    entity = client.get_entity(entity_id)
    fecha = datetime.datetime.today()
    today = fecha.day
    yesterday = today - 1
    posts = client(GetHistoryRequest(
        peer=entity,
        limit=limit,
        offset_date=None,
        offset_id=0,
        max_id=0,
        min_id=0,
        add_offset=0,
        hash=0))
    for post in posts.messages:
        post_day = post.date.day
        if post_day >= yesterday:
            if post.media is not None:
                try:
                    file_name = post.media.document.attributes[0].file_name
                except (AttributeError, IndexError):
                    file_name = post.media.document.attributes[1].file_name
                directorio = os.getcwd() + '/descargas'
                if not os.path.exists('descargas/' + file_name):
                    print(file_name, 'Descargando...')
                    client.download_media(message=post, file=directorio)
                    print('Archivo descargado.')

I think you can handle this with a smaller limit plus an offset, issuing the requests from several workers at once; a multiprocessing/thread pool may help with this approach.
For example, if the limit parameter is set to 10 and there are 1000 ids you want to get, the offsets should be offset = [0, 10, 20, 30, ..., 1000].
Then:
from telethon.sync import TelegramClient
from telethon.tl.functions.messages import GetHistoryRequest
from multiprocessing.pool import ThreadPool
import datetime
import os

def get_entity_data(offset_id, entity_id={your_id}, limit=10):
    # your function, fetching one chunk of `limit` messages from `offset_id`
    ...

offsets = list(range(0, 1001, 10))  # [0, 10, 20, 30, ..., 1000]
with ThreadPool() as pool:
    pool.map(get_entity_data, offsets)
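A slightly fuller sketch of what the per-offset worker might look like, reusing the GetHistoryRequest call from the question; it assumes `client` is available globally, keeps the `{your_id}` placeholder, and passes the chunk offset through offset_id as suggested above, so treat it as illustrative rather than tested:

def get_entity_data(offset_id, entity_id={your_id}, limit=10):
    # fetch one chunk of `limit` messages starting at this worker's offset
    entity = client.get_entity(entity_id)
    posts = client(GetHistoryRequest(
        peer=entity,
        limit=limit,
        offset_date=None,
        offset_id=offset_id,  # each worker starts from its own offset
        max_id=0,
        min_id=0,
        add_offset=0,
        hash=0))
    for post in posts.messages:
        if post.media is not None:
            client.download_media(message=post, file=os.getcwd() + '/descargas')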

Related

How to use FastTelethonhelper properly in existing code?

I am trying to implement fast_download in my existing code. The problem is that the fast_download module uses await, which I am not familiar with. The official documentation is not enough for me to implement it with my current knowledge. Is there anyone who can help me?
here is my code.
from telethon.sync import TelegramClient
from telethon.tl.functions.messages import GetDialogsRequest
from telethon.tl.functions.channels import GetFullChannelRequest
from FastTelethonhelper import fast_download
from telethon.tl.types import InputPeerEmpty
from tqdm import tqdm

api_id =
api_hash = ''

def callback(current, total):
    global pbar
    global prev_curr
    pbar.update(current - prev_curr)
    prev_curr = current

def download_media(group, cl, name):
    global pbar
    global prev_curr
    messages = cl.get_messages(group, limit=2000)
    start = 0
    print(start)
    for i, message in enumerate(messages[start:]):
        prev_curr = 0
        if message.video:
            print("\n{} / {} : {}".format(i + start, len(messages), message.file.name))
            pbar = tqdm(total=message.document.size, unit='B', unit_scale=True)
            message.download_media('./' + name + '/', progress_callback=callback)
            # fast_download(client, message.video, download_folder='./' + name + '/', progress_bar_function=callback)
            pbar.close()

with TelegramClient('name', api_id, api_hash) as client:
    result = client(GetDialogsRequest(
        offset_date=None,
        offset_id=0,
        offset_peer=InputPeerEmpty(),
        limit=500,
        hash=0,
    ))
    title = 'channel_name'  # Title for channel
    channel = client(GetFullChannelRequest(title))
    download_media(channel.full_chat, client, "directory_name")
I was expecting to improve the download speed of the Telethon API. The download speed of some files is limited to around 100 KB/s, which is too slow for large files. I have already implemented code that uses Telethon's built-in download_media.
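One way this might be wired up, as a minimal sketch: the async form of the client lets you await the coroutines directly, and the fast_download arguments below simply mirror the commented-out call in the code above (the api credentials, channel name, folder and progress function are placeholders):

import asyncio
from telethon import TelegramClient
from FastTelethonhelper import fast_download

api_id = 0          # placeholder
api_hash = ''       # placeholder

def progress(current, total):
    # simple progress printer; the tqdm-based callback above could be used instead
    print('{} / {}'.format(current, total), end='\r')

async def main():
    async with TelegramClient('name', api_id, api_hash) as client:
        messages = await client.get_messages('channel_name', limit=2000)
        for message in messages:
            if message.video:
                # fast_download is a coroutine, so it has to be awaited;
                # the arguments mirror the commented-out call in the question
                await fast_download(client, message.video,
                                    download_folder='./directory_name/',
                                    progress_bar_function=progress)

asyncio.run(main())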

How to fetch date and time from internet [duplicate]

I need to get the time for the UK from an NTP server. I found stuff online; however, any time I try out the code I always get back a date and time that is the same as my computer's. I changed the time on my computer to confirm this, and I still got the changed time, so it's not coming from the NTP server.
import ntplib
from time import ctime
c = ntplib.NTPClient()
response = c.request('uk.pool.ntp.org', version=3)
response.offset
print (ctime(response.tx_time))
print (ntplib.ref_id_to_text(response.ref_id))
x = ntplib.NTPClient()
print ((x.request('ch.pool.ntp.org').tx_time))
This will work (Python 3):
import socket
import struct
import sys
import time

def RequestTimefromNtp(addr='0.de.pool.ntp.org'):
    REF_TIME_1970 = 2208988800  # Reference time
    client = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    data = b'\x1b' + 47 * b'\0'
    client.sendto(data, (addr, 123))
    data, address = client.recvfrom(1024)
    if data:
        t = struct.unpack('!12I', data)[10]
        t -= REF_TIME_1970
        return time.ctime(t), t

if __name__ == "__main__":
    print(RequestTimefromNtp())
The timestamp returned by the call to the NTP server is the time in seconds.
ctime() formats the timestamp based on the local machine's timezone settings by default. Thus, for the UK timezone you need to convert tx_time using that timezone. Python's built-in datetime module contains a function for this purpose:
import ntplib
from datetime import datetime, timezone
c = ntplib.NTPClient()
# Provide the respective ntp server ip in below function
response = c.request('uk.pool.ntp.org', version=3)
response.offset
print (datetime.fromtimestamp(response.tx_time, timezone.utc))
The UTC timezone is used here. For working with different timezones you can use the pytz library, for example as sketched below.
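A small sketch of that pytz conversion, assuming the UK pool server from the question and the Europe/London timezone name:

import ntplib
import pytz
from datetime import datetime

c = ntplib.NTPClient()
response = c.request('uk.pool.ntp.org', version=3)

# convert the POSIX timestamp to an aware datetime in the UK timezone
uk = pytz.timezone('Europe/London')
print(datetime.fromtimestamp(response.tx_time, uk))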
This is basically Ahmad's answer, but working for me on Python 3. I am currently keen on Arrow for simplifying times, and then you get:
import arrow
import socket
import struct
import sys

def RequestTimefromNtp(addr='0.de.pool.ntp.org'):
    REF_TIME_1970 = 2208988800  # Reference time
    client = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    data = b'\x1b' + 47 * b'\0'
    client.sendto(data, (addr, 123))
    data, address = client.recvfrom(1024)
    if data:
        t = struct.unpack('!12I', data)[10]
        t -= REF_TIME_1970
        return arrow.get(t)

print(RequestTimefromNtp())
The following function works well using Python 3:
import ntplib
import datetime
from time import ctime

def GetNTPDateTime(server):
    try:
        ntpDate = None
        client = ntplib.NTPClient()
        response = client.request(server, version=3)
        ntpDate = ctime(response.tx_time)
        print(ntpDate)
    except Exception as e:
        print(e)
    return datetime.datetime.strptime(ntpDate, "%a %b %d %H:%M:%S %Y")
I used an ntplib server to get the date and then changed the format to dd-mm-yyyy, as in the sketch below.
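For example, the reformatting step could look like this (the pool server name is just an example):

dt = GetNTPDateTime('uk.pool.ntp.org')
print(dt.strftime('%d-%m-%Y'))  # dd-mm-yyyy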

Python Jupyter Notebook won't run my code, keeps reconnecting

How come this piece of code does not run properly in Jupyter Notebook?
It keeps reconnecting without any result. I am trying to build a database and scrape data as fast as possible from a web server. I use a pool of workers to speed up the process and iterate over multiple URLs (every different URL represents a different day).
import pandas as pd
import datetime
import urllib.error
import requests
from pprint import pprint
import time
from io import StringIO
from multiprocessing import Pool

symbols = ['AAP']
start = time.time()
dflist = []

def load(date):
    if date is None:
        return
    url = "http://regsho.finra.org/FNYXshvol{}.txt".format(date)
    try:
        df = pd.read_csv(url, delimiter='|')
        if any(df['Symbol'].isin(symbols)):
            stocks = df[df['Symbol'].isin(symbols)]
            print(stocks.to_string(index=False, header=False))
            # Save stocks to mysql
        else:
            print(f'No stock found for {date}')
    except urllib.error.HTTPError:
        pass

numdays = 365
start_date = datetime.datetime(2019, 1, 15)  # year - month - day
datelist = [
    (start_date - datetime.timedelta(days=x)).strftime('%Y%m%d') for x in range(0, numdays)
]

pool = Pool(processes=16)
pool.map(load, datelist)
pool.close()
pool.join()

print(time.time() - start)
I would like to know how I can solve this and make it work.
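One thing worth trying, as a sketch rather than a guaranteed fix: multiprocessing.Pool has to pickle the worker function, which is a known source of hangs when the function is defined inside a notebook, and since this workload is I/O-bound a thread pool is usually enough:

from multiprocessing.pool import ThreadPool

# same map() interface, but threads instead of processes, so the
# notebook-defined `load` function never has to be pickled
with ThreadPool(processes=16) as pool:
    pool.map(load, datelist)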

Scraping the metadata of 10,000 website is too slow (Python)

Hi all,
I'm trying to parse the metadata of 10,000 websites into a Pandas dataframe for an SEO / analytics application, but the code is taking ages. I've been trying to run it on 1,000 websites and the code has been running for the last 3 hours (it works without problems on 10-50 websites).
Here's the sample data:
index site
0 http://www.google.com
1 http://www.youtube.com
2 http://www.facebook.com
3 http://www.cnn.com
... ...
10000 http://www.sony.com
Here's my Python (2.7) code:
# Importing dependencies
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import metadata_parser

# Loading the Pandas dataframe
df = pd.read_csv('final_urls')

# Utility functions
def meta(website, metadata):
    full_url = website
    parser = metadata_parser.MetadataParser(url=full_url)
    if metadata == 'all':
        return parser.metadata
    else:
        return parser.metadata[metadata]

def meta_all(website):
    try:
        result = meta(website, 'all')
    except BaseException:
        result = 'Exception'
    return result

# Main
df['site'].apply(meta_all)
I'd like the code to be much faster. I've been using the metadata_parser library (https://github.com/jvanasco/metadata_parser), which relies heavily on requests and BeautifulSoup.
I understand I might be able to change the parser to lxml to make the code faster. It's already installed on my machine, so BeautifulSoup should use it as the primary choice.
Do you have any suggestions to get this code to run faster?
Thanks!
You can use Python Twisted (Twisted is an event-driven networking engine written in Python). You will need to install a few packages with pip: probably twisted, pyopenssl and service_identity, and maybe others. This code works on Python 2.7, which you say you are using.
from twisted.internet import defer, reactor
from twisted.web.client import getPage
import metadata_parser
import pandas as pd
import numpy as np
from multiprocessing import Process

def pageCallback(result, url):
    data = {
        'content': result,
        'url': url,
    }
    return data

def getPageData(url):
    d = getPage(url)
    d.addCallback(pageCallback, url)
    return d

def listCallback(result):
    for isSuccess, data in result:
        if isSuccess:
            print("Call to %s succeeded " % (data['url']))
            parser = metadata_parser.MetadataParser(html=data['content'], search_head_only=False)
            print(parser.metadata)  # do something with it here

def finish(ign):
    reactor.stop()

def start(urls):
    data = []
    for url in urls:
        data.append(getPageData(url))
    dl = defer.DeferredList(data)
    dl.addCallback(listCallback)
    dl.addCallback(finish)

def processStart(chunk):
    start(chunk)
    reactor.run()

df = pd.read_csv('final_urls')
urls = df['site'].values.tolist()
chunkCounter = 0
chunkLength = 1000

for chunk in np.array_split(urls, len(urls) / chunkLength):
    p = Process(target=processStart, args=(chunk,))
    p.start()
    p.join()
    chunkCounter += 1
    print("Finished chunk %s of %s URLs" % (str(chunkCounter), str(chunkLength)))
I have run it on 10,000 URLs and it took less than 16 minutes.
Updated
Normally you would process the data you generate where I added the comment "# do something with it here". In the event that you want the generated data returned back for processing, you can do something like this (I have also updated it to use treq):
from twisted.internet import defer, reactor
import treq
import metadata_parser
import pandas as pd
import numpy as np
import multiprocessing
from twisted.python import log
import sys

# log.startLogging(sys.stdout)

results = []

def pageCallback(result, url):
    # `result` is the response body, resolved by treq.content below
    data = {
        'content': result,
        'url': url,
    }
    return data

def getPageData(url):
    d = treq.get(url, timeout=60, headers={'User-Agent': ["Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0"]})
    d.addCallback(treq.content)  # resolve the response body before handing it on
    d.addCallback(pageCallback, url)
    return d

def listCallback(result):
    global results
    for isSuccess, data in result:
        if isSuccess:
            print("Call to %s succeeded " % (data['url']))
            parser = metadata_parser.MetadataParser(html=str(data['content']), search_head_only=False)
            # print(parser.metadata)  # do something with it here
            results.append((data['url'], parser.metadata))

def finish(ign):
    reactor.stop()

def start(urls):
    data = []
    for url in urls:
        data.append(getPageData(url))
    dl = defer.DeferredList(data)
    dl.addCallback(listCallback)
    dl.addCallback(finish)

def processStart(chunk, returnList):
    start(chunk)
    reactor.run()
    returnList.extend(results)

df = pd.read_csv('final_urls')
urls = df['site'].values.tolist()
chunkCounter = 0
chunkLength = 1000

manager = multiprocessing.Manager()
returnList = manager.list()

for chunk in np.array_split(urls, len(urls) / chunkLength):
    p = multiprocessing.Process(target=processStart, args=(chunk, returnList))
    p.start()
    p.join()
    chunkCounter += 1
    print("Finished chunk %s of %s URLs" % (str(chunkCounter), str(chunkLength)))

for res in returnList:
    print(res)

print(len(returnList))
You may also want to add some error handling; to help with that you can uncomment the line reading "log.startLogging(sys.stdout)", but this is too much detail for one answer. If you get failures for some URLs I would generally retry them by running the code again with just the failed URLs, possibly a few times if necessary; a small sketch of collecting the failed URLs follows.
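One possible way to collect the failed URLs for such a retry pass, as a sketch built on the getPageData above (the errback records the URL and passes the failure along so the existing isSuccess check still skips it):

failed_urls = []

def pageErrback(failure, url):
    # remember which URL failed so it can be retried in a later run
    failed_urls.append(url)
    return failure  # keep it marked as a failure so listCallback skips it

def getPageData(url):
    d = treq.get(url, timeout=60)
    d.addCallback(treq.content)
    d.addCallback(pageCallback, url)
    d.addErrback(pageErrback, url)
    return d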

django time checker database

I am trying to create a thread that allows me to check a database field in order to see if time.now() is greater than the one recorded in the database (PostgreSQL); the problem is that the view (views.py) from which I am calling this is blocked by this thread. This is my actual code:
PS: expire_pet is a text field, which I then cast to datetime.
import datetime
import threading
from time import sleep

from models import Zone

class ControlHora(threading.Thread):
    def __init__(self, zone_id):
        threading.Thread.__init__(self)
        self.zone_id = zone_id

    def run(self):
        while True:
            zone_pet = Zone.objects.filter(id=self.zone_id)
            for i in zone_pet:
                if i.pet_state == True:
                    hour = datetime.datetime.strptime(i.expire_pet, '%I:%M')
                    if hour <= datetime.datetime.now():
                        Zone.objects.filter(id=self.zone_id).update(vitrasa_pet=False)
                        Zone.objects.filter(id=self.zone_id).update(esycsa_pet=False)
                        Zone.objects.filter(id=self.zone_id).update(pet_state=False)
                        Zone.objects.filter(id=self.zone_id).update(expire_pet='')
            sleep(5)
It works; the problem was that I had been calling run in the wrong place, thanks.
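For anyone hitting the same blocking behaviour: the usual pattern is to call start() on the thread (which runs run() in a separate thread) rather than calling run() directly from the view, since a direct run() call executes the loop in the request thread. A minimal sketch, with a hypothetical view name and import path:

# views.py (hypothetical)
from django.http import HttpResponse
from .control import ControlHora  # wherever ControlHora is defined

def start_pet_timer(request, zone_id):
    checker = ControlHora(zone_id)
    checker.start()  # start() spawns the thread; calling run() would block this view
    return HttpResponse('pet timer started')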
