Why does this piece of code not run properly in Jupyter Notebook?
The kernel keeps reconnecting without producing any result. I am trying to build a database by scraping data from a web server as fast as possible. I use a pool of workers to speed up the process and iterate over multiple URLs (each URL represents a different day).
import pandas as pd
import datetime
import urllib
import requests
from pprint import pprint
import time
from io import StringIO
from multiprocessing import Process, Pool
# Ticker symbols whose rows are kept from each daily file (see load()).
symbols = ['AAP']
# Wall-clock start time; the elapsed time is printed at the end of the script.
start = time.time()
# NOTE(review): dflist is not used anywhere in the visible code.
dflist = []
def load(date):
    """Fetch one daily file and print the rows for the tracked symbols.

    Args:
        date: Day to fetch as a ``YYYYMMDD`` string, or ``None`` to do nothing.

    Prints the matching rows (without index or header), or a
    "No stock found" message when the file contains none of ``symbols``.
    A day whose file cannot be downloaded (HTTP error) is skipped silently.
    """
    # ``import urllib`` alone does not guarantee that the ``error`` submodule
    # is loaded, so import it explicitly before referencing HTTPError.
    import urllib.error

    if date is None:
        return
    url = "http://regsho.finra.org/FNYXshvol{}.txt".format(date)
    try:
        df = pd.read_csv(url, delimiter='|')
    except urllib.error.HTTPError:
        # Best-effort: a day without a downloadable file is simply skipped.
        return
    # Build the row mask once and reuse it for both the test and the filter.
    wanted = df['Symbol'].isin(symbols)
    if wanted.any():
        print(df[wanted].to_string(index=False, header=False))
        # Save stocks to mysql
    else:
        print(f'No stock found for {date}')
numdays = 365
start_date = datetime.datetime(2019, 1, 15)  # year - month - day
# One YYYYMMDD string per day, counting backwards from start_date.
datelist = [
    (start_date - datetime.timedelta(days=x)).strftime('%Y%m%d')
    for x in range(numdays)
]

if __name__ == "__main__":
    # The work is I/O-bound (one HTTP download per date), so threads overlap
    # the waiting just as well as processes. Unlike a process pool they do not
    # need ``load`` to be importable/picklable in child processes, which is
    # what fails for functions defined interactively under the "spawn" start
    # method.
    from multiprocessing.pool import ThreadPool

    with ThreadPool(processes=16) as pool:
        pool.map(load, datelist)  # blocks until every date has been handled
    print(time.time() - start)
I would like to know how I can solve this and make it work.
Related
Every time I run the program below, the output keeps adding to the previous outputs, because Object_List keeps being appended to in the background while APScheduler runs the job at intervals. What I need is the real-time, up-to-date list of the objects in the bucket. Right now I get an accumulated list that includes objects that are no longer in the bucket, because the list is never reset between runs. When I run the program manually, I get the expected results, because the list is filled once and the process then completes. Is there a way to run this program in the background and have a fresh list created each time the program produces its output? The program uses an exclude list to filter out unwanted results.
import boto3
from plyer import notification
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.schedulers.background import BlockingScheduler
from time import sleep
import datetime
import schedule
import time
Object_List = []
FTP_File_List = []
# Read the exclusion substrings, one per line. The context manager guarantees
# the file is closed (the original bare ``file.close`` never called close()).
with open('ftp_exclude.txt', 'r') as file:
    excplist = file.readlines()
# Skip blank lines: an empty pattern is a substring of every key and would
# therefore exclude everything.
Exclude_List = [line.strip() for line in excplist if line.strip()]
def AWS_PROD_Check():
    """Print the current, non-excluded objects under ``My_folder/`` and notify.

    Every call queries the bucket afresh and builds its own list of keys, so a
    scheduled run reports only what the bucket returns at that moment instead
    of accumulating keys from earlier runs.
    """
    print(f"AWS_PROD START: {datetime.datetime.now()}")
    session = boto3.Session(profile_name='My_Profile')
    s3 = session.resource('s3')
    my_bucket = s3.Bucket('my_bucket')
    # Local list: appending to the module-level Object_List made the report
    # grow with every scheduled run.
    current_keys = [obj.key for obj in my_bucket.objects.filter(Prefix='My_folder/')]
    # Keep only keys that contain none of the exclusion substrings.
    kept_keys = {
        key for key in current_keys
        if not any(pattern in key for pattern in Exclude_List)
    }
    for key in sorted(kept_keys):
        if '/My_directory/' in key and '.' in key:
            print(key)
    print(f"AWS_PROD END: {datetime.datetime.now()}")
    notification.notify(
        title='AWS_PROD Check',
        message='Report Generated',
        app_icon=None,
        timeout=20,
    )
AWS_PROD_Check()  # run once right away, before the interval job is registered
sched = BackgroundScheduler()
sched.add_job(AWS_PROD_Check, 'interval', minutes=5)
sched.start()
try:
    # start() returns immediately for a background scheduler, so keep the
    # main thread alive while the job keeps firing.
    while True:
        sleep(1)
except (KeyboardInterrupt, SystemExit):
    # Stop the scheduler cleanly on Ctrl-C / interpreter exit.
    sched.shutdown()
I'm trying to develop an exercise-evaluation chatbot.
I want to get the accelerometer data from my iPhone. I found some code for Android; is there an equivalent for iPhone?
https://smartphonedaq.com/accelerometer.page
import android
import time
droid = android.Android()
dt = 100  # 100ms between sensings
endTime = 3000  # sample for 3000ms
timeSensed = 0
droid.startSensingTimed(2, dt)
try:
    while timeSensed <= endTime:
        # A parenthesised single-argument print is valid both as a Python 2
        # statement and as a Python 3 function call.
        print(droid.sensorsReadAccelerometer().result)
        time.sleep(dt / 1000.0)
        timeSensed += dt
finally:
    # Always stop sensing, even if a read fails part-way through.
    droid.stopSensing()
I downloaded 'Pyto' and wrote some code with its 'motion' module. Now I can read the accelerometer data and count squat-and-rise repetitions over a 30-second recording.
Here is the code:
'''
import motion
from datetime import datetime, timedelta
import time
import csv
#import matplotlib.pyplot as plt
SAMPLE_COUNT = 300     # upper bound on the number of readings
SAMPLE_INTERVAL = 0.1  # seconds to wait between readings
THRESHOLD = -0.5       # combined-acceleration level whose downward crossing is counted

tm = []        # timestamp of every reading
combi = []     # x + y + z of every reading
data_dic = []  # [timestamp, x, y, z, x + y + z] per reading

Record_start = datetime.now()
Record_stop = Record_start + timedelta(seconds=30)
print('start time', Record_start)

# Start sensor updates once for the whole recording instead of starting and
# stopping them around every single read.
# NOTE(review): assumes get_acceleration() returns current (x, y, z) values
# while updates are running - confirm against the Pyto ``motion`` docs.
motion.start_updating()
try:
    for _ in range(SAMPLE_COUNT):
        now = datetime.now()
        if now >= Record_stop:
            break
        A = motion.get_acceleration()
        combine = float(A[0]) + float(A[1]) + float(A[2])
        tm.append(now)
        data_dic.append([now, A[0], A[1], A[2], combine])
        combi.append(combine)
        time.sleep(SAMPLE_INTERVAL)
finally:
    # Stop the updates even if a read fails.
    motion.stop_updating()

print('data no.', len(data_dic))
print('end time', Record_stop)

# One repetition = the combined value going from above the threshold to below
# it between two consecutive readings.
count = sum(
    1
    for before, after in zip(combi, combi[1:])
    if before > THRESHOLD and after < THRESHOLD
)
print('you squat and rise', count, 'times.')
'''
But I do not know how to integrate this with a LINE chatbot.
Is it possible to obtain accelerometer data through a LINE chatbot directly?
Or to send a command from the server to the user's (client's) smartphone?
Sorry, I just cannot figure out how to close this loop. Thanks for any helpful comments.
I'm trying to write something that downloads a lot of files from a Telegram channel.
The code works well, but it takes too long, and on top of that I have a slow internet connection.
I have the code below; I am downloading files of about 1 GB each, but it takes a long time. Is there an example of how to make the downloads faster?
from telethon.sync import TelegramClient
from telethon.tl.functions.messages import GetHistoryRequest
import datetime
import os
def _media_file_name(media):
    """Return the first file name found in the media's document attributes, or None."""
    document = getattr(media, 'document', None)
    for attribute in getattr(document, 'attributes', None) or ():
        name = getattr(attribute, 'file_name', None)
        if name:
            return name
    return None


def get_entity_data(entity_id, limit):
    """Download media posted today or yesterday, skipping files already present.

    Args:
        entity_id: Identifier handed to ``client.get_entity``.
        limit: Maximum number of recent messages to request.

    Files are saved under ``<cwd>/descargas``; a file whose name already
    exists there is not downloaded again.
    """
    entity = client.get_entity(entity_id)
    # Compare whole calendar dates. Comparing only the day-of-month breaks at
    # month boundaries (on the 1st, "yesterday" became day 0 and every post
    # matched).
    # NOTE(review): today() is local time; confirm which time zone post.date
    # uses if the exact cut-off matters.
    oldest_wanted = datetime.date.today() - datetime.timedelta(days=1)
    posts = client(GetHistoryRequest(
        peer=entity,
        limit=limit,
        offset_date=None,
        offset_id=0,
        max_id=0,
        min_id=0,
        add_offset=0,
        hash=0))
    directorio = os.path.join(os.getcwd(), 'descargas')
    # Make sure the target exists as a directory before handing it to
    # download_media, so the existence check below looks in the right place.
    os.makedirs(directorio, exist_ok=True)
    for post in posts.messages:
        if post.media is None or post.date.date() < oldest_wanted:
            continue
        file_name = _media_file_name(post.media)
        if file_name is None:
            # No named document attached: nothing to check or save by name.
            continue
        if os.path.exists(os.path.join(directorio, file_name)):
            continue
        print(file_name, 'Descargando...')
        client.download_media(message=post, file=directorio)
        print('Archivo descargado.')
I think you can handle it by using a smaller limit together with an offset and issuing the requests from multiple threads; a pool package may help with this approach.
For example, if the limit parameter is set to 10 and there are 1000 IDs you want to get, the offsets should be offset = [0, 10, 20, 30, ..., 1000].
Then:
from telethon.sync import TelegramClient
from telethon.tl.functions.messages import GetHistoryRequest
import datetime
import os
import pool
# NOTE(review): illustrative pseudo-code, not runnable as written.
# The "..." stands for the omitted multiples of 10 (e.g. list(range(0, 1001, 10))).
offsets = [0, 10, 20, 30, ..., 1000]
# NOTE(review): map() passes each offset as the first positional argument,
# which is entity_id in the signature below, not offset_id - confirm intent.
# NOTE(review): get_entity_data is used here before the definition that follows.
pool.map(get_entity_data, offsets)
# {your_id} and "your function" are placeholders to be filled in by the reader.
def get_entity_data(entity_id={your_id}, limit=10, offset_id=0):
your function
Hi all,
I'm trying to parse the metadata of 10,000 websites into a Pandas dataframe for an SEO / analytics application but the code is taking ages. I've been trying to do it on 1,000 websites and the code has been running for the last 3 hours (it works without problem on 10-50 websites).
Here's the sample data:
index site
0 http://www.google.com
1 http://www.youtube.com
2 http://www.facebook.com
3 http://www.cnn.com
... ...
10000 http://www.sony.com
Here's my Python (2.7) code:
# Importing dependencies
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import metadata_parser
# Loading the Pandas dataframe
# The 'site' column of this CSV holds the URLs that are parsed in the Main step.
df = pd.read_csv('final_urls')
# Utility functions
def meta(website, metadata):
    """Parse *website* with metadata_parser and return its metadata.

    Args:
        website: URL handed to ``metadata_parser.MetadataParser``.
        metadata: ``'all'`` to get the parser's whole ``metadata`` object,
            otherwise the key of the single entry to return.
    """
    parsed = metadata_parser.MetadataParser(url=website).metadata
    return parsed if metadata == 'all' else parsed[metadata]
def meta_all(website):
    """Return all metadata for *website*, or the string ``'Exception'`` on failure.

    Args:
        website: URL to parse.
    """
    try:
        return meta(website, 'all')
    except Exception:
        # Deliberately broad so that one bad site cannot abort the batch, but
        # not BaseException: KeyboardInterrupt and SystemExit must still be
        # able to stop a long run.
        return 'Exception'
# Main
# Keep the parsed result in a new column; a bare .apply(...) call discards it.
df['metadata'] = df['site'].apply(meta_all)
I'd like the code to be much faster. I've been using the metadata_parser library (https://github.com/jvanasco/metadata_parser) which relies heavily on requests and BeautifulSoup.
I understand I might be able to change the parser to lxml for the code to be faster. It's already installed on my machine so BeautifulSoup should use it as the primary choice.
Do you have any suggestion to get this code to run faster?
Thanks!
You can use Python Twisted (Twisted is an event-driven networking engine written in Python). You will need to install a few packages with pip: probably twisted, pyopenssl and service_identity, and possibly others. This code works on Python 2.7, which you say you are using.
from twisted.internet import defer, reactor
from twisted.web.client import getPage
import metadata_parser
import pandas as pd
import numpy as np
from multiprocessing import Process
def pageCallback(result, url):
    """Pair a callback result with the URL it was requested for."""
    return dict(content=result, url=url)
def getPageData(url):
    """Request *url* and return the Deferred with pageCallback attached."""
    # addCallback returns the Deferred it was called on, so the calls chain.
    return getPage(url).addCallback(pageCallback, url)
def listCallback(result):
    """Print the parsed metadata of every entry flagged as successful.

    Args:
        result: Iterable of ``(isSuccess, data)`` pairs; ``data`` is the dict
            built by pageCallback.
    """
    for ok, page in result:
        if not ok:
            continue
        print("Call to %s succeeded " % (page['url']))
        parsed = metadata_parser.MetadataParser(html=page['content'], search_head_only=False)
        print(parsed.metadata)  # do something with it here
def finish(ign):
    """Stop the reactor; the argument (previous callback's result) is ignored."""
    reactor.stop()
def start(urls):
    """Queue a fetch for every URL, then process the results and stop the reactor.

    Args:
        urls: Iterable of URLs to fetch.
    """
    dl = defer.DeferredList([getPageData(url) for url in urls])
    dl.addCallback(listCallback)
    # addBoth rather than addCallback: if listCallback raises, a plain
    # callback would be skipped and the reactor would never be stopped.
    dl.addBoth(finish)
def processStart(chunk):
    """Process entry point: queue the chunk's fetches, then run the reactor."""
    start(chunk)
    reactor.run()
if __name__ == '__main__':
    # Guarded so that child processes which re-import this module do not
    # start spawning processes themselves.
    df = pd.read_csv('final_urls')
    urls = df['site'].values.tolist()
    chunkCounter = 0
    chunkLength = 1000
    # Integer ceiling division, at least 1: array_split needs a positive int,
    # but "/" gives a float on Python 3 and 0 when there are fewer URLs than
    # chunkLength.
    numChunks = max(1, -(-len(urls) // chunkLength))
    for chunk in np.array_split(urls, numChunks):
        # Each chunk runs in its own process, one after another.
        # NOTE(review): presumably because a stopped reactor cannot be
        # started again in the same process - confirm.
        p = Process(target=processStart, args=(chunk,))
        p.start()
        p.join()
        chunkCounter += 1
        print("Finished chunk %s of %s URLs" % (str(chunkCounter), str(chunkLength)))
I have run it on 10,000 URLs and it took less than 16 minutes.
Updated
Normally you would process the data you generated where I added the comment "# do something with it here". In the event you want the generated data returned back for processing you can do something like this (I have also updated to use treq.):
from twisted.internet import defer, reactor
import treq
import metadata_parser
import pandas as pd
import numpy as np
import multiprocessing
from twisted.python import log
import sys
# log.startLogging(sys.stdout)
# Module-level accumulator of (url, metadata) tuples, appended to by listCallback.
results = []
def pageCallback(result, url):
    """Read the response body and pair it with its URL.

    Args:
        result: Response object delivered by ``treq.get``.
        url: The URL that was requested.

    Returns:
        A Deferred that fires with ``{'content': <body>, 'url': url}``; the
        next callback in the chain receives that dict.
    """
    # content() hands back a Deferred, not the body itself. Returning it from
    # this callback makes the outer chain wait until the body has arrived,
    # instead of storing the unfired Deferred inside the dict.
    d = result.content()
    d.addCallback(lambda content: {'content': content, 'url': url})
    return d
def getPageData(url):
    """Request *url* with a 60-second timeout and a browser-like User-Agent.

    Returns:
        The Deferred from ``treq.get`` with pageCallback attached.
    """
    d = treq.get(
        url,
        timeout=60,
        # The version token is "rv:57.0"; the stray quotes and backslash in
        # the original string were garbling the header value.
        headers={'User-Agent': ["Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0"]},
    )
    d.addCallback(pageCallback, url)
    return d
def listCallback(result):
    """Parse every entry flagged as successful and record it in ``results``.

    Args:
        result: Iterable of ``(isSuccess, data)`` pairs; ``data`` is the dict
            produced by pageCallback.
    """
    for ok, page in result:
        if not ok:
            continue
        print("Call to %s succeeded " % (page['url']))
        parsed = metadata_parser.MetadataParser(html=str(page['content']), search_head_only=False)
        # print(parser.metadata) # do something with it here
        results.append((page['url'], parsed.metadata))
def finish(ign):
    """Stop the reactor; the argument (previous callback's result) is ignored."""
    reactor.stop()
def start(urls):
    """Queue a fetch for every URL, then process the results and stop the reactor.

    Args:
        urls: Iterable of URLs to fetch.
    """
    dl = defer.DeferredList([getPageData(url) for url in urls])
    dl.addCallback(listCallback)
    # addBoth rather than addCallback: if listCallback raises, a plain
    # callback would be skipped and the reactor would never be stopped.
    dl.addBoth(finish)
def processStart(chunk, returnList):
    """Process entry point: fetch the chunk, then copy ``results`` into *returnList*.

    Args:
        chunk: URLs to fetch in this process.
        returnList: List-like object that receives this process's results.
    """
    start(chunk)
    reactor.run()  # returns once finish() has stopped the reactor
    returnList.extend(results)
if __name__ == '__main__':
    # Guarded so that child processes which re-import this module do not
    # start spawning processes themselves.
    df = pd.read_csv('final_urls')
    urls = df['site'].values.tolist()
    chunkCounter = 0
    chunkLength = 1000
    # Manager-backed list that the worker processes extend with their results.
    manager = multiprocessing.Manager()
    returnList = manager.list()
    # Integer ceiling division, at least 1: array_split needs a positive int,
    # but "/" gives a float on Python 3 and 0 when there are fewer URLs than
    # chunkLength.
    numChunks = max(1, -(-len(urls) // chunkLength))
    for chunk in np.array_split(urls, numChunks):
        # Each chunk runs in its own process, one after another.
        p = multiprocessing.Process(target=processStart, args=(chunk, returnList))
        p.start()
        p.join()
        chunkCounter += 1
        print("Finished chunk %s of %s URLs" % (str(chunkCounter), str(chunkLength)))
    for res in returnList:
        print(res)
    print(len(returnList))
You may also want to add some error handling. To help with that, you can uncomment the line reading "log.startLogging(sys.stdout)", but covering it fully is too much detail for one answer. If some URLs fail, I would generally retry them by running the code again with just the failed URLs, possibly a few times if necessary.
I need to make multiple requests in a scheduler job to check the live status of 1000 users at a time, but the server accepts at most 50 users per API request. With the following approach, which uses a for loop, it takes around 66 seconds for 1000 users (i.e. 20 API calls).
from apscheduler.schedulers.blocking import BlockingScheduler
# Scheduler instance; the job is registered on it under the __main__ guard.
sched = BlockingScheduler()
def shcdulerjob():
    """Fetch the live status of today's users in batches and save each response.

    The user ids are split into batches of at most 50 (the server's
    per-request limit); one GET request is issued per batch, one after
    another, and each JSON response is passed to ``save_status``.
    """
    batch_size = 50  # SERVER MAX LIMIT - 50 ids/request
    uidlist = todays_userslist()  # Get around 1000 users from table
    # -- DIVIDE LIST BY GIVEN SIZE (here 50)
    idlists = [
        uidlist[offset:offset + batch_size]
        for offset in range(0, len(uidlist), batch_size)
    ]
    for idlist in idlists:
        apiurl = some_server_url + "&ids=" + str(idlist)
        resp = requests.get(apiurl)
        save_status(resp.json())  # -- Save status to db
if __name__ == "__main__":
    # Register the job on a 10-minute interval, then hand control to the scheduler.
    sched.add_job(shcdulerjob, trigger='interval', minutes=10)
    sched.start()
So,
Is there any way to reduce the time required to fetch the data from the API?
Does Python APScheduler provide any multiprocessing option to process such API requests in a single job?
You could try Python's thread pool from the concurrent.futures module, if the server allows concurrent requests. That way you parallelise the processing itself instead of the scheduling.
There are some good examples in the documentation (if you're using Python 2, there is a roughly equivalent module).
e.g.
import concurrent.futures
import multiprocessing
import requests
import time
import json
cpu_start_time = time.process_time()
clock_start_time = time.time()
queue = multiprocessing.Queue()
uri = "http://localhost:5000/data.json"
users = [str(user) for user in range(1, 50)]


def fetch_user(user_id):
    """Return the raw response body for one user id."""
    # The query key must be the string "id"; a bare ``id`` is the builtin
    # function and would be sent as its repr.
    return requests.get(uri, params={"id": user_id}).content


with concurrent.futures.ThreadPoolExecutor(multiprocessing.cpu_count()) as executor:
    # executor.map yields results in input order, so zipping with ``users``
    # pairs every body with its user id.
    for user_id, result in zip(users, executor.map(fetch_user, users)):
        queue.put((user_id, result))
# Drain exactly as many items as were queued: Queue.empty() is not reliable
# right after put(), so it could end the loop early.
for _ in users:
    user_id, rs = queue.get()
    print("User ", user_id, json.loads(rs.decode()))
cpu_end_time = time.process_time()
clock_end_time = time.time()
print("Took {0:.03}s [{1:.03}s]".format(cpu_end_time-cpu_start_time, clock_end_time-clock_start_time))
If you want to use a process pool, just make sure you don't use shared resources (e.g. the queue), and write your data out independently.