I'm displaying data that is already available in a CSV file in a GUI.
If I update some values in the GUI, I want them to be updated in the CSV file as well.
def onClickedSaveReturn(self):
    """closes GUI and returns to calling (main) GUI"""
    # First of all, read general data so that pre-/intra- and postoperative share these
    df_general = Clean.get_GeneralData()
    df_subj = {k: '' for k in Content.extract_saved_data(self.date).keys()}  # create empty dictionary
    df_subj['ID'] = General.read_current_subj().id[0]
    df_subj['PID'] = df_general['PID_ORBIS'][0]
    df_subj['Gender'] = df_general['Gender'][0]
    df_subj['Diagnosis_preop'] = df_general['diagnosis'][0]
    # Now extract the changed data from the GUI
    df_subj['First_Diagnosed_preop'] = self.lineEditFirstDiagnosed.text()
    df_subj['Admission_preop'] = self.lineEditAdmNeurIndCheck.text()
    df_subj['Dismissal_preop'] = self.DismNeurIndCheckLabel.text()
    ...
    subj_id = General.read_current_subj().id[0]  # reads data from current_subj (saved in ./tmp)
    df = General.import_dataframe('{}.csv'.format(self.date), separator_csv=',')
    if df.shape[1] == 1:
        df = General.import_dataframe('{}.csv'.format(self.date), separator_csv=';')
    idx2replace = df.index[df['ID'] == subj_id][0]
    df_subj = df.iloc[idx2replace, :]
    df = df.replace(['nan', ''], [np.nan, np.nan])
    # df.to_csv(os.path.join(FILEDIR, "preoperative.csv"), index=False)
If I run this code including the line "df.to_csv(os.path.join(FILEDIR, "preoperative.csv"), index=False)", all data in the csv file (except the columns "ID", "PID", "Gender" and "Diagnosis_preop") is lost.
I want all the data I defined in df_subj to replace the data in the CSV.
I have no idea why my code isn't working.
Thank you guys!
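One likely culprit: the line df_subj = df.iloc[idx2replace, :] overwrites the df_subj dictionary with the old row instead of writing the new values into the frame, so nothing in df actually changes before it is saved. A minimal sketch of the reversed assignment, assuming df_subj's keys match the CSV's column names:

# Sketch: write the updated values into the matching row, then persist the frame.
idx2replace = df.index[df['ID'] == subj_id][0]
for col, val in df_subj.items():
    df.loc[idx2replace, col] = val
df = df.replace(['nan', ''], [np.nan, np.nan])
df.to_csv(os.path.join(FILEDIR, 'preoperative.csv'), index=False)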
I am building a table that saves the values of an output DataFrame into a CSV file (or whatever output is defined).
I defined a generate_agrid(df) function that outputs a class instance containing a data attribute that is a pd.DataFrame. When I run grid_table = generate_agrid(df), the grid_table generated contains the original df, even if I modify it in the UI. I noticed this when I checked the input that my update function received.
What I want is to:
graph the data in df -> update the DF data in the UI and return -> save the new df data into a CSV every time I press the update button.
Why does my generate_agrid method always return the initial DF used as input? How can I update it?
My code:
import pandas as pd
import streamlit as st
from st_aggrid import AgGrid, AgGridTheme, DataReturnMode, GridOptionsBuilder  # assumed imports for the AgGrid names used below
from metrics.get_metrics import get_data
from metrics.config import PATH_SAMPLES

filename: str = 'updated_sample.csv'
save_path = PATH_SAMPLES.joinpath(filename)

def generate_agrid(data: pd.DataFrame):
    gb = GridOptionsBuilder.from_dataframe(data)
    gb.configure_default_column(editable=True)  # Make columns editable
    gb.configure_pagination(paginationAutoPageSize=True)  # Add pagination
    gb.configure_side_bar()  # Add a sidebar
    gb.configure_selection('multiple', use_checkbox=True,
                           groupSelectsChildren="Group checkbox select children")  # Enable multi-row selection
    gridOptions = gb.build()

    grid_response = AgGrid(
        data,
        gridOptions=gridOptions,
        data_return_mode=DataReturnMode.AS_INPUT,
        update_on='MANUAL',  # <- Should it let me update before returning?
        fit_columns_on_grid_load=False,
        theme=AgGridTheme.STREAMLIT,  # Add theme color to the table
        enable_enterprise_modules=True,
        height=350,
        width='100%',
        reload_data=True
    )

    data = grid_response['data']
    selected = grid_response['selected_rows']
    df = pd.DataFrame(selected)  # Pass the selected rows to a new dataframe df
    return grid_response
def update(grid_table: classmethod, filename: str = 'updated_sample.csv'):
    save_path = PATH_SAMPLES.joinpath(filename)
    grid_table_df = pd.DataFrame(grid_table['data'])
    grid_table_df.to_csv(save_path, index=False)

# First data gather
df = get_data()

if __name__ == '__main__':
    # Start graphing
    grid_table = generate_agrid(df)
    # Update
    st.sidebar.button("Update", on_click=update, args=[grid_table])
Found the issue; it was just a small parameter that was enabled.
While instantiating the AgGrid, I had to eliminate the reload_data=True parameter; with reload_data=True the grid apparently re-reads the original input on every rerun, discarding any manual edits. After doing that, everything worked as expected and the data could be successfully updated after manually editing cells and pressing "Update".
This is how AgGrid must be instantiated:
grid_response = AgGrid(
    data,
    gridOptions=gridOptions,
    data_return_mode=DataReturnMode.AS_INPUT,
    update_on='MANUAL',
    fit_columns_on_grid_load=False,
    theme=AgGridTheme.STREAMLIT,  # Add theme color to the table
    enable_enterprise_modules=True,
    height=350,
    width='100%',
)
I am new to Python and currently learning the language. I am trying to build a web scraper that will export the data to a CSV. I have the data I want and downloaded it to a CSV. The problem is that I have only managed to dump the data from one index, and I want to dump the data from all the indexes into the same CSV to form a database.
The problem I have is that I can only request one company by indicating its index. For example, n_empresa[0] gets me the data for the first entry in the list. What I want is to get the data for all the indexes in the same function and then dump them with pandas into a CSV, and thus be able to create a DB.
I'm stuck at this point and don't know how to proceed. Can you help me, please?
This is the function:
import requests
from lxml import html

session = requests.Session()  # assumed; the original code uses an existing `session`

def datos_directorio(n_empresa):
    r = session.get(n_empresa[0])
    home = r.content.decode('UTF-8')
    tree = html.fromstring(home)

    descripcion_direccion_empresas = '//p[@class = "paragraph"][2]//text()[normalize-space()]'
    nombre_e = '//h1[@class ="mb3 h0 bold"][normalize-space()]/text()'
    email = '//div[@class = "inline-block mb1 mr1"][3]/a[@class = "mail button button-inverted h4"]/text()[normalize-space()]'
    teléfono = '//div[@class = "inline-block mb1 mr1"][2]/a[@class = "tel button button-inverted h4"]/text()[normalize-space()]'

    d_empresas = tree.xpath(descripcion_direccion_empresas)
    d_empresas = " ".join(d_empresas)

    empresas_n = tree.xpath(nombre_e)
    empresas_n = " ".join(empresas_n[0].split())

    email_e = tree.xpath(email)
    email_e = " ".join(email_e[0].split())

    teléfono_e = tree.xpath(teléfono)
    teléfono_e = " ".join(teléfono_e[0].split())

    contenido = {
        'EMPRESA': empresas_n,
        'EMAIL': email_e,
        'TELÉFONO': teléfono_e,
        'CONTACTO Y DIRECCIÓN': d_empresas
    }
    return contenido
Best regards.
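One way to do this is to make the function take a single URL and loop over the whole list, collecting one dict per company before writing the CSV. A minimal sketch, assuming datos_directorio is adapted to accept one URL (r = session.get(url)) and that n_empresas is the full list of company URLs; the output filename is made up:

import pandas as pd

registros = [datos_directorio(url) for url in n_empresas]  # one dict per company page
pd.DataFrame(registros).to_csv('empresas.csv', index=False)  # 'empresas.csv' is a hypothetical name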
I'm working on pulling data from a public API and converting the response JSON to a pandas DataFrame. I've written the code to pull the data and gotten a successful JSON response. The issue I'm having is parsing through the file and converting the data to a DataFrame. Whenever I run my for loop, I get a DataFrame that returns 1 row when it should be returning approximately 2500 rows and 6 columns. I've copied and pasted my code below.
Things to note:
I've replaced my API key with "api_key".
I'm new(ish) to Python, so I understand that my code formatting might not be best practice. I'm open to changes.
Here is the link to the API that I am requesting from: https://developer.va.gov/explore/facilities/docs/facilities?version=current
import json
import time

import pandas as pd
import requests

facilities_data = pd.DataFrame(columns=['geometry_type', 'geometry_coordinates', 'id', 'facility_name', 'facility_type', 'facility_classification'])

# function that will make the api call and sort through the json data
def get_facilities_data(facilities_data):
    # Make API Call
    res = requests.get('https://sandboxapi.va.gov/services/va_facilities/v0/facilities/all', headers={'apikey': 'api_key'})
    data = json.loads(res.content.decode('utf-8'))
    time.sleep(1)
    for facility in data['features']:
        geometry_type = data['features'][0]['geometry']['type']
        geometry_coordinates = data['features'][0]['geometry']['coordinates']
        facility_id = data['features'][0]['properties']['id']
        facility_name = data['features'][0]['properties']['name']
        facility_type = data['features'][0]['properties']['facility_type']
        facility_classification = data['features'][0]['properties']['classification']
        # Save data into pandas dataframe
        facilities_data = facilities_data.append(
            {'geometry_type': geometry_type, 'geometry_coordinates': geometry_coordinates,
             'facility_id': facility_id, 'facility_name': facility_name, 'facility_type': facility_type,
             'facility_classification': facility_classification}, ignore_index=True)
    return facilities_data

facilities_data = get_facilities_data(facilities_data)
print(facilities_data)
As mentioned, you should:
- loop over facility instead of data['features'][0]
- append within the loop
This will get you the result you are after.
# note: 'id' renamed to 'facility_id' so the column matches the appended keys
facilities_data = pd.DataFrame(columns=['geometry_type', 'geometry_coordinates', 'facility_id', 'facility_name', 'facility_type', 'facility_classification'])

def get_facilities_data(facilities_data):
    # Make API Call
    res = requests.get("https://sandbox-api.va.gov/services/va_facilities/v0/facilities/all",
                       headers={"apikey": "REDACTED"})
    data = json.loads(res.content.decode('utf-8'))
    time.sleep(1)
    for facility in data['features']:
        geometry_type = facility['geometry']['type']
        geometry_coordinates = facility['geometry']['coordinates']
        facility_id = facility['properties']['id']
        facility_name = facility['properties']['name']
        facility_type = facility['properties']['facility_type']
        facility_classification = facility['properties']['classification']
        # Save data into pandas dataframe
        facilities_data = facilities_data.append(
            {'geometry_type': geometry_type, 'geometry_coordinates': geometry_coordinates,
             'facility_id': facility_id, 'facility_name': facility_name, 'facility_type': facility_type,
             'facility_classification': facility_classification}, ignore_index=True)
    return facilities_data

facilities_data = get_facilities_data(facilities_data)
print(facilities_data.head())
There are some more things we can improve upon:
- .json() can be called directly on the requests response
- time.sleep() is not needed
- appending to a DataFrame on each iteration is discouraged; we can collect the data another way and create the DataFrame afterwards
Implementing these improvements results in:
import pandas as pd
import requests

def get_facilities_data():
    data = requests.get("https://sandbox-api.va.gov/services/va_facilities/v0/facilities/all",
                        headers={"apikey": "REDACTED"}).json()
    facilities_data = []
    for facility in data["features"]:
        facility_data = (facility["geometry"]["type"],
                         facility["geometry"]["coordinates"],
                         facility["properties"]["id"],
                         facility["properties"]["name"],
                         facility["properties"]["facility_type"],
                         facility["properties"]["classification"])
        facilities_data.append(facility_data)
    facilities_df = pd.DataFrame(data=facilities_data,
                                 columns=["geometry_type", "geometry_coords", "id", "name", "type", "classification"])
    return facilities_df
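A quick usage check (assuming a valid key replaces "REDACTED"):

facilities_df = get_facilities_data()
print(facilities_df.head())  # should report roughly 2500 rows and 6 columns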
I have Python code that loops through multiple locations and pulls data from a third-party API. Below is the code; sublocation_ids are location IDs coming from a directory.
As you can see from the code, the data gets converted to a DataFrame and then saved to an Excel file. The current issue I am facing is that if the API does not return data for publication_timestamp for a certain location, the loop stops and does not proceed, and I get the error shown below the code.
How do I avoid this and skip to the next iteration if no data is returned by the API?
import json

import pandas as pd
import requests
from pandas import json_normalize

# writer is assumed to be an existing pd.ExcelWriter; sublocation_ids,
# filter_text and api_endpoint are assumed to be defined elsewhere.
for sub in sublocation_ids:
    city_num_int = sub['id']
    city_num_str = str(city_num_int)
    city_name = sub['name']
    filter_text_new = filter_text.format(city_num_str)
    data = json.dumps({"filters": [filter_text_new], "sort_by": "created_at", "size": 2})
    r = requests.post(url=api_endpoint, data=data).json()
    articles_list = r["articles"]
    articles_list_normalized = json_normalize(articles_list)
    df = articles_list_normalized
    df['publication_timestamp'] = pd.to_datetime(df['publication_timestamp'])
    df['publication_timestamp'] = df['publication_timestamp'].apply(lambda x: x.now().strftime('%Y-%m-%d'))
    df.to_excel(writer, sheet_name=city_name)
    writer.save()
KeyError: 'publication_timestamp'
Change this bit of code:
df = articles_list_normalized
if 'publication_timestamp' in df.columns:
    df['publication_timestamp'] = pd.to_datetime(df['publication_timestamp'])
    df['publication_timestamp'] = df['publication_timestamp'].apply(lambda x: x.now().strftime('%Y-%m-%d'))
    df.to_excel(writer, sheet_name=city_name)
else:
    continue
If the API literally returns no data, i.e. {}, then you might even do the check before normalizing:

if articles_list:
    df = json_normalize(articles_list)
    # ... rest of code ...
else:
    continue
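An equivalent pattern is to catch the KeyError itself, which also covers any other missing column later in the body (a sketch of the same loop body):

try:
    df['publication_timestamp'] = pd.to_datetime(df['publication_timestamp'])
    df['publication_timestamp'] = df['publication_timestamp'].apply(lambda x: x.now().strftime('%Y-%m-%d'))
except KeyError:
    continue  # skip this location when the column is absent
df.to_excel(writer, sheet_name=city_name)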
I'm trying to manipulate some .sav files with SavReaderWriter. What I already have is this:
with savReaderWriter.SavReader(dirIn, ioUtf8=True) as reader:
    df = pd.DataFrame(reader.all(), columns=[s for s in reader.header])
    varLabels = reader.varLabels
    varTypes = reader.varTypes
    valueLabels = reader.valueLabels
    varWidth = reader.varWids  # <------------- This guy
    varMeasure = reader.measureLevels
    varAlignments = reader.alignments
    varColumnWidths = reader.columnWidths
    varMissingValues = reader.missingValues
and:
with SavWriter(savFileName=dirOut,
               varNames=varNames,
               varTypes=varTypes,
               varLabels=varLabels,
               valueLabels=valueLabels,
               measureLevels=varMeasure,
               columnWidths=varColumnWidths,
               alignments=varAlignments,
               missingValues=varMissingValues,
               ioUtf8=True
               ) as writer:
    for record in records:
        writer.writerow(record)
The problem is that I don't know how to set, in the SavWriter part, the variable width that I got when reading the .sav in the first code block. Does anyone know what I can do?
I actually got it working!
First I had to get the formats when reading the .sav:
varFormats = reader.formats
Then just add this param when opening the SavWriter:
formats = varFormats
I kind of made my own way, since the docs don't help that much, but they gave me an idea of how the formats work:
https://pythonhosted.org/savReaderWriter/index.html#formats
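Put together, the relevant pieces might look like this (a sketch under the same reader/writer setup as above; the SPSS format strings, e.g. 'F8.2', encode the widths):

with savReaderWriter.SavReader(dirIn, ioUtf8=True) as reader:
    varFormats = reader.formats  # per-variable SPSS format strings
    ...

with savReaderWriter.SavWriter(savFileName=dirOut,
                               varNames=varNames,
                               varTypes=varTypes,
                               formats=varFormats,  # carries the variable widths over
                               ioUtf8=True) as writer:
    for record in records:
        writer.writerow(record)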