Generalize Getting Data From SQL Server to Python

I'm working on a task where I have to get data from SQL Server, and because I'm running time series analysis, I need to specify a date field that can change for every table or query. I also need to read either a simple query or a stored procedure. I want to generalize my code below, which is field- and database-specific. I thought I could define an empty dictionary in the class and then use it in the read_data method below, but I'm not sure how to go about it.
import pandas as pd
import pyodbc
from datetime import datetime

class DataPrep:
    def __init__(self, conn):
        self.df = pd.DataFrame()
        self.mega_projects = set()
        self.mega_project_to_df = {}
        self.mega_project_to_df_pvt = {}
        self.conn = conn

    def read_data(self):
        # fill in the real connection settings; pyodbc.connect takes keyword
        # arguments (or a connection string), not a dict
        self.conn = pyodbc.connect(driver=None, server=None, database=None,
                                   uid=None, pwd=None)
        self.df = pd.read_sql_query('''exec [dbo].[ML_WorkLoad]''', self.conn,
                                    parse_dates={'CreatedDate': '%d/%m/%Y %H.%M.%S'})
        #self.df = self.df[['EstimateManDay', 'CreatedDate', 'MegaProject', 'ProjectName']]
        self.df['month'] = pd.DatetimeIndex(self.df['CreatedDate']).month
        self.df['year'] = pd.DatetimeIndex(self.df['CreatedDate']).year
        self.df['quarter'] = pd.DatetimeIndex(self.df['CreatedDate']).quarter
        self.df['week'] = pd.DatetimeIndex(self.df['CreatedDate']).week
        self.df['dayorg'] = pd.DatetimeIndex(self.df['CreatedDate']).day
        self.df['day'] = 1
        self.df['year_quarter'] = self.df['year'].astype(str) + "_" + self.df['quarter'].astype(str)
        self.df['year_month'] = self.df['year'].astype(str) + "_" + self.df['month'].astype(str)
        self.df['year_week'] = self.df['year'].astype(str) + "_" + self.df['week'].astype(str)
        self.df['date'] = pd.to_datetime(self.df[['year', 'month', 'day']])
        self.df = self.df[self.df['CreatedDate'] <= datetime.strptime("2020-01-01", "%Y-%m-%d")]
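A minimal sketch of one way to generalize this (my suggestion, not tested against the database above; the constructor arguments are names I made up): pass the connection settings, the query text (a plain SELECT or an exec of a stored procedure), and the date column in as parameters instead of hard-coding them.

import pandas as pd
import pyodbc

class DataPrep:
    def __init__(self, conn_params, query, date_col, date_format=None):
        self.conn_params = conn_params  # dict of pyodbc settings: driver, server, database, uid, pwd
        self.query = query              # 'SELECT ...' or 'exec [dbo].[SomeProc]'
        self.date_col = date_col        # the date field, which varies per table/query
        self.date_format = date_format  # e.g. '%d/%m/%Y %H.%M.%S', or None to let pandas infer
        self.df = pd.DataFrame()

    def read_data(self, cutoff=None):
        conn = pyodbc.connect(**self.conn_params)
        # parse_dates accepts a list of column names or a {column: format} dict
        parse = {self.date_col: self.date_format} if self.date_format else [self.date_col]
        self.df = pd.read_sql_query(self.query, conn, parse_dates=parse)
        dates = pd.DatetimeIndex(self.df[self.date_col])
        self.df['year'], self.df['month'], self.df['quarter'] = dates.year, dates.month, dates.quarter
        self.df['year_month'] = self.df['year'].astype(str) + "_" + self.df['month'].astype(str)
        if cutoff is not None:
            self.df = self.df[self.df[self.date_col] <= cutoff]
        return self.df

# usage: DataPrep(conn_params, 'exec [dbo].[ML_WorkLoad]', 'CreatedDate', '%d/%m/%Y %H.%M.%S').read_data()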

Related

PyLucene cannot find a word that is present in a text that was indexed earlier

I use PyLucene 9.4.1 to index a document, and I just noticed a weird problem. There are some words, e.g. 'baby', that are present in the document, but PyLucene is unable to find them in the index.
This is my code to index the document:
(The document can be downloaded from here.)
filepath = os.getcwd() + '/' + 'wiki_movie_plots_deduped.csv'

def indexDocument(title, year, plot):
    ft = FieldType()
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
    ft.setStored(True)
    ft.setTokenized(True)
    ft.setStoreTermVectors(True)
    ft.setStoreTermVectorOffsets(True)
    ft.setStoreTermVectorPositions(True)
    doc = document.Document()
    doc.add(document.Field("Title", title, ft))
    doc.add(document.Field("Plot", plot, ft))
    writer.addDocument(doc)

def CloseWriter():
    writer.close()

def makeInvertedIndex(file_path):
    df = pd.read_csv(file_path)
    print(df.columns)
    docid = 0
    for i in df.index:
        print(docid, '-', df['Title'][i])
        indexDocument(df['Title'][i], df['Release Year'][i], df['Plot'][i])
        docid += 1

indexPath = File('index/').toPath()
indexDir = FSDirectory.open(indexPath)
writerConfig = IndexWriterConfig(EnglishAnalyzer())
writer = IndexWriter(indexDir, writerConfig)
inverted = makeInvertedIndex(filepath)
CloseWriter()
This is the code to search the created index for a keyword:
keyword = 'baby'
fieldname = 'Title'
result = list()
indexPath = File('index/').toPath()
directory = FSDirectory.open(indexPath)
analyzer = StandardAnalyzer()
reader = DirectoryReader.open(directory)
searcher = IndexSearcher(DirectoryReader.open(directory))
query = QueryParser(fieldname, analyzer).parse(keyword)
print('query', query)
numdocs = searcher.count(query)
print("#-docs:", numdocs)
searcher.setSimilarity(BM25Similarity(1.2,0.75))
scoreDocs = searcher.search(query, 1000).scoreDocs # it returns TopDocs object containing scoreDocs and totalHits
# scoreDoc object contains docId and score
print('total hit:', searcher.search(query, 100).totalHits)
print("%s total matching documents" % (len(scoreDocs)))
Any help to understand the problem is appreciated.
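One thing that stands out (my observation, not part of the original post): the index is written with EnglishAnalyzer, which applies Porter stemming ('baby' is indexed as the stem 'babi'), while the search side uses StandardAnalyzer, which does not stem, so the query term 'baby' never matches. A minimal sketch of the fix under that assumption, reusing the imports from the snippets above, is to parse the query with the same analyzer used at index time:

analyzer = EnglishAnalyzer()  # same analyzer as the one given to IndexWriterConfig
query = QueryParser(fieldname, analyzer).parse(keyword)
print('query', query)  # should now print something like Title:babi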

How do I insert into a Table one Primary Key and Two Foreign Keys?

I work with rolls of plastic film in different lengths and widths, and I'm creating a database to store all the orders. In order to avoid repetition, I created separate tables for length (class Comprimento) and width (class Largura), and I used UUIDs to create distinct IDs.
Now I want to cross both tables in a model class, which is:
import uuid
from sqlalchemy import Column, String, ForeignKey
from sqlalchemy.orm import relationship
# GUID is a custom column type and Base the declarative base, defined elsewhere

class Largura(Base):
    __tablename__ = 'largura'
    id = Column(GUID(), primary_key=True, default=lambda: str(uuid.uuid4()))
    largura = Column(String)
    modelos_l = relationship('Modelo', back_populates='larguras', cascade='all, delete')

    def __repr__(self):
        return f"<Largura {self.largura}>"

class Comprimento(Base):
    __tablename__ = 'comprimento'
    id = Column(GUID(), primary_key=True, default=lambda: str(uuid.uuid4()))
    comprimento = Column(String)
    modelos_c = relationship('Modelo', back_populates='comprimentos', cascade='all, delete')

    def __repr__(self):
        return f"<Comprimento {self.comprimento}>"

class Modelo(Base):
    __tablename__ = 'modelo'
    id = Column(GUID(), primary_key=True, default=lambda: str(uuid.uuid4()))
    descricao = Column(String(50))
    # no random defaults on the FK columns: a default uuid4 here would point at
    # rows that don't exist in largura/comprimento
    largura_id = Column(GUID(), ForeignKey("largura.id"))
    comprimento_id = Column(GUID(), ForeignKey("comprimento.id"))
    larguras = relationship('Largura', back_populates='modelos_l')
    comprimentos = relationship('Comprimento', back_populates='modelos_c')

    def __repr__(self):
        return f"<Modelo {self.id}>"
Then I created a file dedicated to inserting data into this table:
from DBModelPy3 import Comprimento, Largura, Modelo, session
from sqlalchemy import create_engine
import pandas as pd

# Pre-loading my CSV file
df = pd.read_csv("dataorged.csv", sep=',')
pd.set_option('display.float_format', '{:.0f}'.format)  # change the number format to hide the ','
cnx = create_engine('sqlite:///data_hub2.db', echo=True).connect()
df_modelo = df[['larg_ajustada', 'comp']]  # the dataframe with the orders; only the columns needed for this insertion
#print(df_modelo)

# Loading the tables from my database
df_largura = pd.read_sql_table('largura', cnx)
df_comprimento = pd.read_sql_table('comprimento', cnx)
With everything loaded, I decided to combine all the lengths and widths I already had in my two tables (df_largura and df_comprimento), and then filter using the original file, which contains the orders.
# COMBINING ALL THE LENGTHS AND WIDTHS FROM MY TABLES
model_num = []
for n_larg in range(len(df_largura)):
    db_larg = str(df_largura['largura'][n_larg])
    for n_comp in range(len(df_comprimento)):
        db_comp = df_comprimento['comprimento'][n_comp]
        combined = str(db_larg) + "x" + str(db_comp)
        model_num.append([db_larg, db_comp, combined])

df_modelos_ex = pd.DataFrame(model_num)
df_modelos_ex.columns = ['larg', 'comp', 'combined']
With these, I had all possible combinations in my dataframe, and I created the combined column to match on later.
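Incidentally, the same cross join can be written in one step with itertools.product (a sketch of an alternative, not the code I used):

from itertools import product

# every (largura, comprimento) pair plus the combined "LxC" key
model_num = [[str(l), str(c), f"{l}x{c}"]
             for l, c in product(df_largura['largura'], df_comprimento['comprimento'])]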
modelos_existentes = []
# COMBINATIONS THAT APPEAR IN THE ORDER DATAFRAME #
for item in range(len(df_modelo)):
    mod_larg = df_modelo['larg_ajustada'][item]
    mod_comp = df_modelo['comp'][item]
    mod_comb = str(mod_larg) + "x" + str(mod_comp)
    modelos_existentes.append([mod_larg, mod_comp, mod_comb])

df_mod_existentes = pd.DataFrame(modelos_existentes)
df_mod_existentes.columns = ['ex_larg', 'ex_comp', 'ex_comb']
df_limpo = df_mod_existentes.drop_duplicates(subset=['ex_comb'])
df_limpo.reset_index(drop=True, inplace=True)
With all my elements in place, the madness began. I started a loop to run through all my dataframes:
for l_row in range(len(df_limpo)):                        # for each row of the orders dataframe,
    larg = df_limpo['ex_larg'][l_row]                     # width
    comp = df_limpo['ex_comp'][l_row]                     # length
    comb = df_limpo['ex_comb'][l_row]                     # combination of both
    for n_row in range(len(df_largura)):                  # for each row of the width table from the DB,
        db_larg_id = df_largura['id'][n_row]              # the PK of the width
        db_larg_largura = df_largura['largura'][n_row]    # its value
        lar = session.query(Largura).filter(Largura.id == db_larg_id).first()
        if db_larg_largura == larg:                       # if the table value matches the order row,
            for m_row in range(len(df_comprimento)):      # for each length in the DB table,
                db_comp_id = df_comprimento['id'][m_row]
                db_comp_comprimento = df_comprimento['comprimento'][m_row]
                compr = session.query(Comprimento).filter(Comprimento.id == db_comp_id).first()
                if db_comp_comprimento == comp:           # if the table value matches the order row,
                    # note: n_linha is not defined in this loop (it belongs to the
                    # later snippet), so this line raises a NameError; l_row is
                    # probably what was meant
                    new_model = Modelo(descricao=df_limpo['ex_comb'][n_linha], larguras=lar, comprimentos=compr)
From here, I would only add session.add(new_model) and session.commit() to finish my code. But it's not adding.
What I would like is for my Modelo table to look like:
MODELO table
ID (PK) | DESCRIPTION (combined values string) | largura_id (width id, FK) | comprimento_id (length id, FK)
Sorry about the long explanation. I tried my best!
If anyone has the same trouble, this is what worked for me:
##########################
# ADDING TO THE DATABANK #
##########################
lista_a = []  # created an empty list
for n_linha in range(len(df_limpo)):                      # ran through my dataframe
    larg_a = df_limpo['ex_larg'][n_linha]                 # extracted width and length from it
    comp_a = df_limpo['ex_comp'][n_linha]
    for m_linha in range(len(df_largura)):                # ran through my width table from the database
        db_larg_id = df_largura['id'][m_linha]
        db_larg_largura = df_largura['largura'][m_linha]
        if larg_a == db_larg_largura:                     # checked if the width from my dataframe matches the one in the table
            lista_a.append([larg_a, comp_a, db_larg_id])  # appended to lista_a
#print(lista_a)

df_lista_a = pd.DataFrame(lista_a)                        # created a new dataframe
df_lista_a.columns = ['larg', 'comp', 'id_larg']

lista_b = []                                              # created a new list
for n_row in range(len(df_lista_a)):                      # ran through the new dataframe
    larg_b = df_lista_a['larg'][n_row]                    # extracted each column from it
    comp_b = df_lista_a['comp'][n_row]
    larg_b_id = df_lista_a['id_larg'][n_row]
    for m_row in range(len(df_comprimento)):              # ran through my length table
        db_comp_id = df_comprimento['id'][m_row]
        db_comp_comprimento = df_comprimento['comprimento'][m_row]
        if comp_b == db_comp_comprimento:                 # checked if the length from the dataframe matches the one in the table
            lista_b.append([larg_b, comp_b, larg_b_id, db_comp_id])  # appended the length id to my list
            break
#print(lista_b)

df_lista_b = pd.DataFrame(lista_b)                        # converted to a dataframe
df_lista_b.columns = ['larg', 'comp', 'id_larg', 'id_comp']

# HERE'S THE ACTUAL INSERTION
for n_model in range(len(df_lista_b)):  # for each model found, extract the values and build a new_model
    mod_largura_id = df_lista_b['id_larg'][n_model]
    mod_comprimento_id = df_lista_b['id_comp'][n_model]
    # query by the ids collected above (my first attempt queried
    # df_largura['id'][1] / df_comprimento['id'][1], which pinned every model
    # to the same two rows)
    lar = session.query(Largura).filter(Largura.id == mod_largura_id).first()
    compr = session.query(Comprimento).filter(Comprimento.id == mod_comprimento_id).first()
    new_model = Modelo(descricao=df_limpo['ex_comb'][n_model], larguras=lar, comprimentos=compr)
    print("Modelo: " + df_limpo['ex_comb'][n_model] + " com Id's " + mod_largura_id + " e " + mod_comprimento_id + " adicionados!")
    session.add(new_model)
    session.commit()
Then it's done.
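If it helps anyone further, the two matching loops can be collapsed into pandas merges (a sketch of an alternative, my suggestion; the value columns may need their dtypes aligned first, since largura and comprimento are stored as strings):

# map order values to their ids with merges instead of nested loops
df_ids = (df_limpo
          .merge(df_largura.rename(columns={'id': 'id_larg', 'largura': 'ex_larg'}), on='ex_larg')
          .merge(df_comprimento.rename(columns={'id': 'id_comp', 'comprimento': 'ex_comp'}), on='ex_comp'))

for row in df_ids.itertuples(index=False):
    session.add(Modelo(descricao=row.ex_comb, largura_id=row.id_larg, comprimento_id=row.id_comp))
session.commit()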

can't adapt type 'data'

I have a data class:
class data:
    def __init__(self, ReadTime, Concentration_PM10, Concentration_SO2, Concentration_O3,
                 Concentration_NO2, Concentration_CO, AQI_PM10, AQI_SO2, AQI_O3, AQI_NO2,
                 AQI_CO, AQI_AQIIndex, AQI_ContaminantParameter, AQI_State, AQI_Color):
        self.ReadTime = ReadTime
        self.Concentration_PM10 = Concentration_PM10
        self.Concentration_SO2 = Concentration_SO2
        self.Concentration_O3 = Concentration_O3
        self.Concentration_NO2 = Concentration_NO2
        self.Concentration_CO = Concentration_CO
        self.AQI_PM10 = AQI_PM10
        self.AQI_SO2 = AQI_SO2
        self.AQI_O3 = AQI_O3
        self.AQI_NO2 = AQI_NO2
        self.AQI_CO = AQI_CO
        self.AQI_AQIIndex = AQI_AQIIndex
        self.AQI_ContaminantParameter = AQI_ContaminantParameter
        self.AQI_State = AQI_State
        self.AQI_Color = AQI_Color
I'm sending a request to an API and appending the resulting objects to a list:
list = []
for i in result:
    list.append(data(i['ReadTime'], i['Concentration']['PM10'], i['Concentration']['SO2'],
                     i['Concentration']['O3'], i['Concentration']['NO2'], i['Concentration']['CO'],
                     i['AQI']['PM10'], i['AQI']['SO2'], i['AQI']['O3'], i['AQI']['NO2'],
                     i['AQI']['CO'], i['AQI']['AQIIndex'], i['AQI']['ContaminantParameter'],
                     i['AQI']['State'], i['AQI']['Color']))
Then I want to insert this list into a PostgreSQL table, but I get the error "can't adapt type 'data'":
list_record = ", ".join(["%s"] * len(list))
query_insert= (f"INSERT INTO hava_kalitesi (ReadTime, Concentration_PM10, Concentration_SO2, Concentration_O3, Concentration_NO2, Concentration_CO, AQI_PM10, AQI_SO2, AQI_O3, AQI_NO2, AQI_CO, AQI_AQIIndex, AQI_ContaminantParameter,AQI_State,AQI_Color) VALUES {list_record}"
)
cursor.execute(query_insert,list)
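The likely cause (my reading of the error, not stated in the post): psycopg2 only knows how to adapt built-in types such as numbers, strings, and tuples, not arbitrary class instances. A sketch of a fix under that assumption is to convert each object to a plain tuple first:

# convert each `data` object to a tuple; psycopg2 adapts tuples natively
rows = [(d.ReadTime, d.Concentration_PM10, d.Concentration_SO2, d.Concentration_O3,
         d.Concentration_NO2, d.Concentration_CO, d.AQI_PM10, d.AQI_SO2, d.AQI_O3,
         d.AQI_NO2, d.AQI_CO, d.AQI_AQIIndex, d.AQI_ContaminantParameter,
         d.AQI_State, d.AQI_Color) for d in list]
cursor.executemany(
    "INSERT INTO hava_kalitesi (ReadTime, Concentration_PM10, Concentration_SO2, "
    "Concentration_O3, Concentration_NO2, Concentration_CO, AQI_PM10, AQI_SO2, AQI_O3, "
    "AQI_NO2, AQI_CO, AQI_AQIIndex, AQI_ContaminantParameter, AQI_State, AQI_Color) "
    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
    rows)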

How to get a big amount of data as fast as possible

I am trying to return an array of constructed objects that are built on top of objects I retrieve from one URL, plus other fields that I get from another URL.
I have an array that consists of two arrays, each with about 8000 objects...
I have tried to make each object construction a thread; however, it still takes a lot of time...
Any solution? Here is my code:
import concurrent.futures
import json
import requests

def get_all_players_full_data(ea_players_json):
    all = []
    ea_players_json = list(ea_players_json.values())
    for i in range(len(ea_players_json)):
        for player_obj in ea_players_json[i]:
            all.append(player_obj)
    for player_obj in range(len(all)):  # note: this outer loop re-runs the whole pool len(all) times
        all_data = []
        with concurrent.futures.ThreadPoolExecutor(len(all)) as executor:
            for player_data in all:
                future = executor.submit(build_full_player_data_obj, player_data)
                print(future.result())  # result() blocks here, so the submissions run one at a time
                all_data.append(future.result())
def build_full_player_data_obj(ea_player_data):
    if ea_player_data.get("c") is not None:
        player_full_name = ea_player_data.get("c")
    else:
        player_full_name = ea_player_data.get("f") + " " + ea_player_data.get("l")
    player_id = ea_player_data.get("id")
    # go to futhead to find all cards of that player
    futhead_url_player_data = f'{FUTHEAD_PLAYER}{player_full_name}'
    details_of_specific_player = json.loads(requests.get(futhead_url_player_data).content)
    cards_from_the_same_id = []
    for player_in_json_futhead in details_of_specific_player:
        if player_in_json_futhead["player_id"] == player_id:
            rating = player_in_json_futhead["rating"]
            specific_card_id = player_in_json_futhead["def_id"]
            revision = player_in_json_futhead["revision_type"]
            name = player_in_json_futhead["full_name"]
            nation = player_in_json_futhead["nation_name"]
            position = player_in_json_futhead["position"]
            club = player_in_json_futhead["club_name"]
            cards_from_the_same_id.append(Player(specific_card_id, name, rating, revision,
                                                 nation, position, club))
    return cards_from_the_same_id
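A pattern that may help (my sketch, not the original code): flatten the input once, submit every task to a single bounded pool, and collect results only after everything is queued, so the downloads actually overlap instead of being serialized by the early result() calls:

import concurrent.futures

def get_all_players_full_data(ea_players_json):
    # flatten the two sub-arrays into one list of player dicts
    players = [p for group in ea_players_json.values() for p in group]
    # one pool with a bounded worker count; executor.map keeps the input order
    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
        return list(executor.map(build_full_player_data_obj, players))

Reusing a single requests.Session() inside build_full_player_data_obj (instead of calling requests.get directly) would also avoid a new TCP handshake per request.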

Google Sheets API - Formatting inserted values

With this code I've updated a bunch of rows in a Google Spreadsheet.
The request goes well and returns the updatedRange below.
result = service.spreadsheets().values().append(
    spreadsheetId=spreadsheetId,
    range=rangeName,
    valueInputOption="RAW",
    insertDataOption="INSERT_ROWS",
    body=body
).execute()
print(result)
print("Range updated")
updateRange = result['updates']['updatedRange']
Now I would like to make a batchUpdate request to set the formatting or set a protected range, but those APIs require a range specified as startRowIndex, endRowIndex, and so on.
How can I retrieve the row indexes from the updatedRange?
Waiting for a native or better answer, I'll post a function I created to translate a named range into a gridRange.
The function is far from perfect and does not translate the sheet name to a sheet id (I left that task to another specific function), but it accepts named ranges in the forms:
sheet!A:B
sheet!A1:B
sheet!A:B5
sheet!A1:B5
Here is the code
import re
def namedRange2Grid(self, rangeName):
ascii_uppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
match = re.match(".*?\!([A-Z0-9]+)\:([A-Z0-9]+)", rangeName)
if match:
start = match.group(1)
end = match.group(2)
matchStart = re.match("([A-Z]{1,})([1-9]+){0,}", start)
matchEnd = re.match("([A-Z]{1,})([1-9]+){0,}", end)
if matchStart and matchEnd:
GridRange = {}
letterStart = matchStart.group(1)
letterEnd = matchEnd.group(1)
if matchStart.group(2):
numberStart = int(matchStart.group(2))
GridRange['startRowIndex'] = numberStart - 1
if matchEnd.group(2):
numberEnd = int(matchEnd.group(2))
GridRange['endRowIndex'] = numberEnd
i = 0
for l in range(0, len(letterStart)):
i = i + (l * len(ascii_uppercase))
i = i + ascii_uppercase.index(letterStart[l])
GridRange['startColumnIndex'] = i
i = 0
for l in range(0, len(letterEnd)):
i = i + (l * len(ascii_uppercase))
i = i + ascii_uppercase.index(letterEnd[l])
GridRange['endColumnIndex'] = i + 1
return GridRange
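A quick sanity check of the output (my example; the indexes are 0-based and end-exclusive, as the Sheets API expects, and note the column arithmetic above only holds for column names up to two letters):

# assuming the method lives on some wrapper class instance `svc`
print(svc.namedRange2Grid('sheet!A1:B5'))
# -> {'startRowIndex': 0, 'endRowIndex': 5, 'startColumnIndex': 0, 'endColumnIndex': 2}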
