Web scraper running automatically 2 times a day - Python (Selenium)

I've created a simple web scraper that gets some information from the CNN website and puts it into a database table.
It's working properly in Python, and I'm using VS Code.
I am looking for a way to run this script twice a day automatically. Does anyone know how to do it? I tried AWS but was not able to get it working.
I want the code to run automatically online, with my computer off, and it has to update my CSV file.
Some important information:
Since it is a web scraper, it depends on some files in my folders, such as chromedriver.exe and a CSV file to which each run appends a new row of information.
Here is my code:
imports:
import pandas as pd
from datetime import datetime
import requests
import json
from pandas_datareader import data as web
import yfinance as yf
from selenium import webdriver
from selenium.webdriver.common.by import By
from time import sleep
from datetime import date, timedelta
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import pyodbc
Web scraping code:
dataset = pd.read_csv(r"C:\Users\belig\OneDrive\Python\MeuProjeto\Projetos\WebScrapping_News\WebScrapping_News\dataset_news.csv", sep=";")
dataset.to_csv(r"C:\Users\belig\OneDrive\Python\MeuProjeto\Projetos\WebScrapping_News\WebScrapping_News\dataset.csv", index=False)  # snapshot copy; must come after the read above
# Creating variables (placeholder values; each one is overwritten below)
# %%
Date = 1
WeekDay = 2
Brazil_Ibovespa = 3
BRL_Dollar = 4
Titulo_CNNBrasil = 5
# Setup Date Var
Date = datetime.now().strftime("%d/%m/%Y, %H:%M:%S")
Date
# Setup WeekDay Var
date_now = datetime.now()
WeekDay = date_now.strftime("%A")
WeekDay
# Setup Brazil_Ibovespa Var
today = date.today()
start_day = today - timedelta(days = 7)
ticker_ibovespa = "^BVSP"  # renamed from tickers_DowJones: this is the Ibovespa ticker
datayf = yf.download(ticker_ibovespa, start=start_day, end=today)
print(datayf)
datayf = datayf['Adj Close']
Brazil_Ibovespa = datayf.iloc[-1]  # positional indexing; datayf[-1] is deprecated for a Series
Brazil_Ibovespa
# Setup BRL_Dollar Var
requisicao = requests.get('https://economia.awesomeapi.com.br/all/USD-BRL')
cotacao = requisicao.json()
BRL_Dollar = round(float(cotacao['USD']['bid']),2)
BRL_Dollar
# Starting the Selenium driver (headless option to hide the browser window)
driver_exe = r'C:\Users\belig\OneDrive\Python\MeuProjeto\Projetos\WebScrapping_News\WebScrapping_News\chromedriver.exe'
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(service=Service(driver_exe), options=options)  # Selenium 4 style; passing the path as the first argument is deprecated
# Setup Titulo_CNNBrasil Var
driver.get('https://www.cnnbrasil.com.br/')
Titulo_CNNBrasil = driver.find_element(By.XPATH, '//*[@id="block1847327"]/div/div/a/h2').text
print(Titulo_CNNBrasil)
# Setup Url_CNNBrasil Var
driver.find_element(By.XPATH, '//*[@id="block1847327"]/div/div/a/h2').click()
Url_CNNBrasil = driver.current_url
print(Url_CNNBrasil)
# Setup Topics_CNNBrasil Var
try:
    Topics_CNNBrasil = driver.find_element(By.CLASS_NAME, 'tags__list').text
    Topics_CNNBrasil = Topics_CNNBrasil.replace('\n', ', ')
    print(Topics_CNNBrasil)
except Exception:
    Topics_CNNBrasil = 'None'
    print(Topics_CNNBrasil)
Add to SQL and DataFrame:
# Add Row to DataFrame
new_row = pd.DataFrame({"Date": [Date], "WeekDay": [WeekDay], "Brazil_Ibovespa": [Brazil_Ibovespa], "BRL_Dollar": [BRL_Dollar], "Titulo_CNNBrasil": [Titulo_CNNBrasil], "Url_CNNBrasil": [Url_CNNBrasil], "Topics_CNNBrasil": [Topics_CNNBrasil]}, index=[0])
print(new_row)
dataset = pd.concat([dataset, new_row], ignore_index=True)
# dataset = dataset.append({"Date":Date, "WeekDay": WeekDay}, ignore_index=True)
print(dataset)
dataset.to_csv(r'C:\Users\belig\OneDrive\Python\MeuProjeto\Projetos\WebScrapping_News\WebScrapping_News\dataset_news.csv', index=False, encoding="utf-8-sig", sep = ';')
# Add info to SQL Server
dados_conexao = (
    "Driver={SQL Server};"
    "Server=Beligolli;"
    "Database=WebScrappingNews;"
    "Trusted_Connection=yes;"
    # "UID=Login;"
    # "PWD=Senha;"
)
conexao = pyodbc.connect(dados_conexao)
cursor = conexao.cursor()
comando = "INSERT INTO NewsDataBase (Date_Hour, WeekDay_, Brazil_Ibovespa, BRL_Dollar, Titulo_CNNBrasil, Url_CNNBrasil, Topics_CNNBrasil) VALUES (?, ?, ?, ?, ?, ?, ?)"
valores = (Date, WeekDay, Brazil_Ibovespa, BRL_Dollar, Titulo_CNNBrasil, Url_CNNBrasil, Topics_CNNBrasil)
cursor.execute(comando, valores)
cursor.commit()
cursor.close()
conexao.close()
print(f'Added {Date} - {WeekDay} to the dataset')
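
Since the goal is to run this twice a day, it helps to first wrap all of the steps above in a single function that a scheduler (cron, Windows Task Scheduler, or a cloud job) can call. A minimal sketch, trimmed to the timestamp, exchange-rate and CSV steps; CSV_PATH and the omitted yfinance/Selenium parts are placeholders to fill in from the code above:
import os
import pandas as pd
import requests
from datetime import datetime

CSV_PATH = "dataset_news.csv"  # assumption: point this at the real dataset_news.csv path

def run_scraper():
    # Timestamp and weekday, same format as above
    now = datetime.now()
    date_str = now.strftime("%d/%m/%Y, %H:%M:%S")
    weekday = now.strftime("%A")

    # BRL/USD quote from the same API used above
    cotacao = requests.get("https://economia.awesomeapi.com.br/all/USD-BRL").json()
    brl_dollar = round(float(cotacao["USD"]["bid"]), 2)

    # ... the Ibovespa (yfinance) and CNN headline (Selenium) steps would go here ...

    new_row = pd.DataFrame({"Date": [date_str], "WeekDay": [weekday], "BRL_Dollar": [brl_dollar]})
    if os.path.exists(CSV_PATH):
        dataset = pd.read_csv(CSV_PATH, sep=";")
        dataset = pd.concat([dataset, new_row], ignore_index=True)
    else:
        dataset = new_row
    dataset.to_csv(CSV_PATH, index=False, encoding="utf-8-sig", sep=";")

if __name__ == "__main__":
    run_scraper()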

First I start with this library to schedule the events:
import schedule
import time

def job():
    print("I'm working...")

schedule.every(10).minutes.do(job)
schedule.every().hour.do(job)
schedule.every().day.at("10:30").do(job)

while True:
    schedule.run_pending()
    time.sleep(1)
Then save the script and run it; if you want it to keep running in the background on Windows, you can run the app as a Windows service.
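
For the twice-a-day requirement specifically, the same schedule calls can point at the scraping job instead of the demo job(). A minimal sketch, assuming a run_scraper() wrapper like the one sketched after the question's code; the two times of day are placeholders, and this only fires while the machine and this process stay up, so for the "computer off" requirement the same function would have to be triggered by cron, Task Scheduler, or a cloud scheduler on an always-on host:
import time
import schedule

from my_scraper import run_scraper  # hypothetical module holding the wrapper above

schedule.every().day.at("09:00").do(run_scraper)  # placeholder times
schedule.every().day.at("18:00").do(run_scraper)

while True:
    schedule.run_pending()
    time.sleep(60)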

Related

Linux NoHup fails for Streaming API IG Markets where file is python

This is quite a specific question about nohup on Linux running a Python file.
Back-story: I am trying to save down streaming data (an IG Markets broadcast signal), and I run it on a remote server so I don't have to keep my own local desktop up 24/7.
Somehow, nohup will not engage when the script listens to the broadcast signal.
Below is the example Python code:
#!/usr/bin/env python
#-*- coding:utf-8 -*-
"""
IG Markets Stream API sample with Python
"""
user_ = 'xxx'
password_ = 'xxx'
api_key_ = 'xxx' # this is the 1st api key
account_ = 'xxx'
acc_type_ = 'xxx'
fileLoc = 'marketdata_IG_spx_5min.csv'
list_ = ["CHART:IX.D.SPTRD.DAILY.IP:5MINUTE"]
fields_ = ["UTM", "LTV", "TTV", "BID_OPEN", "BID_HIGH", \
"BID_LOW", "BID_CLOSE",]
import time
import sys
import traceback
import logging
import warnings
import pandas as pd  # needed for pd.read_csv / pd.DataFrame below
warnings.filterwarnings('ignore')
from trading_ig import (IGService, IGStreamService)
from trading_ig.lightstreamer import Subscription
cols_ = ['timestamp', 'data']
# A simple function acting as a Subscription listener
def on_prices_update(item_update):
    # print("price: %s " % item_update)
    print("xxxxxxxx")

# A simple function acting as a Subscription listener
def on_charts_update(item_update):
    # print("price: %s " % item_update)
    print("xxxxxx"
          .format(stock_name=item_update["name"], **item_update["values"]))
    res_ = ["xxxxx"
            .format(stock_name=item_update["name"], **item_update["values"])
            .split(' ')]
    # display(pd.DataFrame(res_))
    try:
        data_ = pd.read_csv(fileLoc)[cols_]
        data_ = data_.append(pd.DataFrame(res_, columns=cols_))
        data_.to_csv(fileLoc)
        print('there is data and we are reading it')
        # display(data_)
    except:
        pd.DataFrame(res_, columns=cols_).to_csv(fileLoc)
        print('there is no data and we are saving first time')
    time.sleep(60)  # sleep for 1 min
def main():
    logging.basicConfig(level=logging.INFO)
    # logging.basicConfig(level=logging.DEBUG)
    ig_service = IGService(
        user_, password_, api_key_, acc_type_
    )
    ig_stream_service = IGStreamService(ig_service)
    ig_session = ig_stream_service.create_session()
    accountId = account_
    ################ my code to set the sleep function to sleep/read only at certain time intervals
    s_time = time.time()
    ############################
    # Making a new Subscription in MERGE mode
    subscription_prices = Subscription(
        mode="MERGE",
        # make sure to put L1 in front of the instrument name
        items=list_,
        fields=fields_
    )
    # adapter="QUOTE_ADAPTER")
    # Adding the "on_charts_update" function to the Subscription
    subscription_prices.addlistener(on_charts_update)
    # Registering the Subscription
    sub_key_prices = ig_stream_service.ls_client.subscribe(subscription_prices)
    print('this is the line here')
    input("{0:-^80}\n".format("HIT CR TO UNSUBSCRIBE AND DISCONNECT FROM LIGHTSTREAMER"))
    # Disconnecting
    ig_stream_service.disconnect()

if __name__ == '__main__':
    main()
Then I try to run it on Linux using this command: nohup python marketdata.py
where marketdata.py is basically the Python code above.
Somehow, nohup will not engage. Does any expert see what I am missing in my code?
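
One thing worth checking (an assumption, since no error output is shown): the script ends with an input(...) call, and a job running under nohup in the background has no usable terminal to read from, so that call can stop the process or raise EOFError before the stream does any work. A minimal sketch of keeping the process alive without the interactive prompt (run_for_seconds is a placeholder):
import time

def wait_then_disconnect(ig_stream_service, run_for_seconds=8 * 60 * 60):
    # Keep the process (and the Lightstreamer listeners) alive without
    # reading from the terminal, then disconnect cleanly.
    try:
        time.sleep(run_for_seconds)
    finally:
        ig_stream_service.disconnect()

# in main(), instead of the input(...) line:
# wait_then_disconnect(ig_stream_service)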

How can I refresh the data in the background of a running flask app?

I have a simple Flask app that queries a database to write a CSV, then uses pyplot to create a chart from it.
I would like to refresh the data in the background every 10 minutes while the app is running. The page doesn't need to refresh the HTML automatically; it just needs to have fresh data when someone opens the page.
Can I do that in a single script? Or do I need to run a separate script from crontab or something?
I would just kick over the container every 10 minutes, but it takes about 5 minutes to run the query, so that's a 5-minute outage. Not a great idea. I'd prefer it to fetch in the background.
Here is what I'm working with:
import os
from datetime import date
import teradatasql
import pandas as pd
import matplotlib.pyplot as plt
from flask import Flask, render_template
import time
import multitasking
### variables
ausername = os.environ.get('dbuser')
apassword = os.environ.get('dbpassword')
ahost = os.environ.get('dbserver')
systems = ["prd1", "prd2", "frz1", "frz2", "devl"]
qgsystems = ["", "#Tera_Prd2_v2", "#Tera_Frz1_v2", "#Tera_Frz2_v2", "#Tera_Devl_v2"]
weeks = ["0", "7", "30"]
query = """{{fn teradata_write_csv({system}_{week}_output.csv)}}select (bdi.infodata) as sysname,
to_char (thedate, 'MM/DD' ) || ' ' || Cast (thetime as varchar(11)) as Logtime,
sum(drc.cpuuexec)/sum(drc.secs) (decimal(7,2)) as "User CPU",
sum(drc.cpuuserv)/sum(drc.secs) (decimal(7,2)) as "System CPU",
sum(drc.cpuiowait)/sum(drc.secs) (decimal(7,2)) as "CPU IO Wait"
from dbc.resusagescpu{qgsystem} as drc
left outer join boeing_tables.dbcinfotbl{qgsystem} as bdi
on bdi.infokey = 'sysname'
where drc.thedate >= (current_date - {week})
order by logtime asc
Group by sysname,logtime
;
"""
### functions
@multitasking.task
def fetch(system, qgsystem, week):
    with teradatasql.connect(host=ahost, user=ausername, password=apassword) as con:
        with con.cursor() as cur:
            cur.execute(query.format(system=system, qgsystem=qgsystem, week=week))
            [print(row) for row in cur.fetchall()]

@multitasking.task
def plot(system, week):
    for week in weeks:
        for system in systems:
            df = pd.read_csv(system + "_" + week + "_output.csv")
            df.pop('sysname')
            df.plot.area(x="Logtime")
            figure = plt.gcf()
            figure.set_size_inches(12, 6)
            plt.savefig("/app/static/" + system + "_" + week + "_webchart.png", bbox_inches='tight', dpi=100)
### main
for week in weeks:
    for system, qgsystem in zip(systems, qgsystems):
        fetch(system, qgsystem, week)

for week in weeks:
    for system in systems:
        plot(system, week)

app = Flask(__name__, template_folder='templates')

@app.route('/')
def index():
    return render_template("index.html")
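
One pattern that fits the "fresh data when someone opens the page" requirement is a background thread inside the same Flask process that re-runs the fetch/plot cycle every 10 minutes. A minimal sketch, assuming the fetch(), plot(), weeks, systems and qgsystems defined above (the interval is a placeholder):
import threading
import time

REFRESH_SECONDS = 600  # 10 minutes; placeholder interval

def refresh_loop():
    # Re-run the existing fetch/plot cycle forever, in the background.
    while True:
        for week in weeks:
            for system, qgsystem in zip(systems, qgsystems):
                fetch(system, qgsystem, week)
        for week in weeks:
            for system in systems:
                plot(system, week)
        time.sleep(REFRESH_SECONDS)

# daemon=True so the thread dies with the Flask process
threading.Thread(target=refresh_loop, daemon=True).start()
Note that if the app later runs with multiple worker processes, each worker would start its own refresh thread.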

Encoding issue when exporting from a DataFrame in Python to MS Access

Dears,
I have a Python script with a query that reads from a DB, stores the result in a DataFrame, then exports it to MS Access.
In the loop, it divides the result into 3 files (each file has a different month).
The issue is in the column LI_DESC: it has Arabic letters that show correctly in Jupyter, but the characters come out wrong when exported to Access.
Here are the columns showing correctly in Jupyter: (screenshot omitted)
Here are the columns as shown in the Access file: (screenshot omitted)
Python code:
import cx_Oracle
import os
import accessdb
import pandas as pd
dsn_tns = cx_Oracle.makedsn('10.112.**.****', '1521', service_name='cdwn10g.hq')
conn = cx_Oracle.connect(user='BI', password='BI', dsn=dsn_tns , encoding='utf-8')
sql_query= pd.read_sql_query("""SELECT MONTH1,LI_DESC,PORT,REGS_NUM,REG_DT,CTRY_CD,TAR_CD,UNS_QTY,UN_CD,KGN,KGG,CIF_AMT,CURCY_CD,CURCY_RT
FROM STTS.CDS
WHERE SUBSTR(REG_DT_G,1,6) BETWEEN to_number(extract(year from add_months(sysdate,-3)) || '' || to_char(add_months(sysdate,-3), 'MM')) AND to_number(extract(year from add_months(sysdate,-1)) || '' || to_char(add_months(sysdate,-1), 'MM'))
ORDER BY PORT, REGS_NUM, REG_DT""",conn)
df = pd.DataFrame(sql_query)
from datetime import datetime
today = datetime.now()
if not os.path.exists(r'C:\Users\nalkar\Documents\Python Scripts\RUNDATE' + today.strftime('%Y%m%d')):
    os.makedirs(r'C:\Users\nalkar\Documents\Python Scripts\RUNDATE' + today.strftime('%Y%m%d'))
    months = df['MONTH1'].unique().tolist()
    for month in months:
        mydf = df.loc[df.MONTH1 == month]
        mydf.to_accessdb(r"C:\Users\nalkar\Documents\Python Scripts\RUNDATE" + today.strftime('%Y%m%d') + "\%s.accdb" % month, "Data")
    print('done')
else:
    print(r'directory already exist')
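
Not a verified fix, but one alternative worth trying for the garbled Arabic text is to skip to_accessdb and insert the rows through pyodbc with the Access ODBC driver, since parameterized inserts pass the values through as Unicode rather than re-encoding them. A sketch, assuming the .accdb file already exists (e.g. copied from an empty template) with a Data table whose columns match the DataFrame, and that the Access Database Engine ODBC driver is installed:
import pandas as pd
import pyodbc

def write_df_to_access(mydf, accdb_path, table="Data"):
    # Assumes accdb_path exists and `table` has columns matching the DataFrame.
    conn_str = (
        r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};"
        r"DBQ=" + accdb_path + ";"
    )
    placeholders = ", ".join("?" for _ in mydf.columns)
    columns = ", ".join(mydf.columns)
    sql = f"INSERT INTO {table} ({columns}) VALUES ({placeholders})"
    # Convert numpy scalars / NaN to plain Python values so pyodbc accepts them
    records = [
        tuple(None if pd.isna(v) else (v.item() if hasattr(v, "item") else v) for v in row)
        for row in mydf.itertuples(index=False, name=None)
    ]
    with pyodbc.connect(conn_str) as conn:
        cursor = conn.cursor()
        cursor.executemany(sql, records)
        conn.commit()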

Final piece of code missing for fundamental data extract from TWS IB

I took the code below from one of the answered questions on Stack Overflow (unfortunately I cannot give full credit, as I cannot locate the page anymore). I changed it a bit to fit my purpose.
I want to extract historical Reuters data (fundamentalData) for a list of tickers. The code below works fine, but it only grabs the last ticker's data. I know I need to build a loop that keeps the results, but I tried many times and none worked. I'm sure this is a quick fix, but since I am new to coding and Python in general, I just can't find the solution. Any help would be appreciated!
#Import all libriaries
from ib.opt import ibConnection, message
from time import sleep
import lxml.etree
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from IPython.core.debugger import set_trace
from ibapi import wrapper
from ibapi.client import EClient
from ibapi.common import *
from ibapi.contract import Contract
#upload excel file of list of company tickers you want to review
us_list= pd.read_excel(r'C:\TWS API\Request Reuters data\Quant Project IB TWS_test.xlsx', engine='openpyxl')
stocksList = us_list[['TICKER']]
stocksList
def fundamentalData_handler(msg):
    global imported
    imported = msg.data

def error_handler(msg):
    pass

# processing of the lines in financial statements
def extractVauleFromLineItem(fiscalperiod, code):
    stage1 = fiscalperiod.find(name='lineitem', coacode=code)
    if not stage1 is None:
        stage2 = stage1.get_text()
        if stage2 == stage2:
            stage3 = float(stage2)
            if stage3 == stage3:
                return stage3
            else:
                return 0.0
result = pd.DataFrame(columns=['Year', 'Ticker', 'control', 'TotalRevenue', 'GrossProfit', 'CommonSharesOutstanding', 'DilutedNormalizedEPS', 'totalCash', 'TotalDebt', 'Dividends'])
outcomes = []
for i, row in stocksList.iterrows():
    contract = Contract()
    contract.symbol = row['TICKER']
    contract.secType = "STK"
    contract.currency = "USD"
    contract.exchange = "SMART"
    tws = ibConnection("127.0.0.1", port=7497, clientId=901)
    tws.register(error_handler, message.Error)
    tws.register(fundamentalData_handler, message.fundamentalData)
    tws.connect()
    tws.reqFundamentalData(1, contract, 'ReportsFinStatements')
    sleep(1)
    tws.disconnect()
    print(contract.symbol)
    soup = BeautifulSoup(imported)  # library for processing of the obtained XML data
    data = []
    print(soup.find(name='issueid', type="Ticker").get_text())
    print(soup.find(name='coid', type="CompanyName").get_text())
    # I found that IB API is not very stable.
    # Sometimes it returns data of the wrong company.
    # So the control is important
    print('Control -', contract.symbol == soup.find(name='issueid', type="Ticker").get_text())
    print()
    for fiscalperiod in soup.find_all(name="fiscalperiod", type="Annual"):
        year = fiscalperiod['fiscalyear']
        TotalRevenue = extractVauleFromLineItem(fiscalperiod, 'RTLR')
        GrossProfit = extractVauleFromLineItem(fiscalperiod, 'SGRP')
        CommonSharesOutstanding = extractVauleFromLineItem(fiscalperiod, 'QTCO')
        DilutedNormalizedEPS = extractVauleFromLineItem(fiscalperiod, 'VDES')
        totalCash = extractVauleFromLineItem(fiscalperiod, 'OTLO')
        TotalDebt = extractVauleFromLineItem(fiscalperiod, 'STLD')
        Dividends = extractVauleFromLineItem(fiscalperiod, 'FCDP')
        thisYearData = (year, contract.symbol, (contract.symbol == soup.find(name='issueid', type="Ticker").get_text()), TotalRevenue, GrossProfit, CommonSharesOutstanding, totalCash, TotalDebt, Dividends)
        data.append(thisYearData)
df_data = pd.DataFrame(data, columns=['Year', 'control', 'TotalRevenue', 'GrossProfit', 'CommonSharesOutstanding', 'DilutedNormalizedEPS', 'totalCash', 'TotalDebt', 'Dividends'])
df_data = df_data.sort_values(by=['Year'])
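
Regarding the "only the last ticker survives" symptom: data is re-created inside the ticker loop, and the final DataFrame is only built from that last list. A pattern-only sketch of accumulating into one list across tickers and building the DataFrame once after the loop (the elided comments stand for the unchanged per-ticker code above, and the added Ticker column is an assumption to keep the rows distinguishable):
# Sketch: collect every ticker's rows in one list, build the DataFrame once.
all_rows = []

for i, row in stocksList.iterrows():
    # ... same per-ticker request and BeautifulSoup parsing as above ...
    for fiscalperiod in soup.find_all(name="fiscalperiod", type="Annual"):
        # ... same extractVauleFromLineItem calls as above ...
        all_rows.append((year, contract.symbol, TotalRevenue, GrossProfit,
                         CommonSharesOutstanding, DilutedNormalizedEPS,
                         totalCash, TotalDebt, Dividends))

df_data = pd.DataFrame(all_rows, columns=['Year', 'Ticker', 'TotalRevenue', 'GrossProfit',
                                          'CommonSharesOutstanding', 'DilutedNormalizedEPS',
                                          'totalCash', 'TotalDebt', 'Dividends'])
df_data = df_data.sort_values(by=['Year'])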

Python Jupyter Notebook won't run my code, keeps reconnecting

How come this piece of code does not run properly in a Jupyter Notebook?
It keeps reconnecting without any result. I am trying to build a database and scrape data as fast as possible from a web server. I use a pool of workers to speed up the process and iterate over multiple URLs (every different URL represents a different day).
import pandas as pd
import datetime
import urllib
import requests
from pprint import pprint
import time
from io import StringIO
from multiprocessing import Process, Pool
symbols = ['AAP']
start = time.time()
dflist = []
def load(date):
    if date is None:
        return
    url = "http://regsho.finra.org/FNYXshvol{}.txt".format(date)
    try:
        df = pd.read_csv(url, delimiter='|')
        if any(df['Symbol'].isin(symbols)):
            stocks = df[df['Symbol'].isin(symbols)]
            print(stocks.to_string(index=False, header=False))
            # Save stocks to mysql
        else:
            print(f'No stock found for {date}')
    except urllib.error.HTTPError:
        pass
pool = []
numdays = 365
start_date = datetime.datetime(2019, 1, 15 ) #year - month - day
datelist = [
(start_date - datetime.timedelta(days=x)).strftime('%Y%m%d') for x in range(0, numdays)
]
pool = Pool(processes=16)
pool.map(load, datelist)
pool.close()
pool.join()
print(time.time() - start)
I would like to know how I can solve this and make it work.
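
One likely culprit (an assumption, since no traceback is shown) is multiprocessing.Pool inside a notebook: the worker processes have to import the target function from a module, and a function defined in a notebook cell can't be imported, so the kernel dies and reconnects. Because the work here is I/O-bound (HTTP downloads), threads are usually enough and run fine inside the notebook. A minimal sketch reusing the load() function, datelist and start from above:
# Drop-in alternative: threads instead of processes.
# ThreadPool has the same map/close/join API as multiprocessing.Pool,
# but the workers run inside the notebook process, so no importable module is needed.
from multiprocessing.pool import ThreadPool

pool = ThreadPool(processes=16)
pool.map(load, datelist)
pool.close()
pool.join()
print(time.time() - start)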
