Failed to read data from Bigtable using Google's canonical example - python

I'm following Google's advice for writing a single row of data to Bigtable and then reading it back again.
Hence my code looks like this:
import datetime

from google.cloud import bigtable


def write_simple(project_id, instance_id, table_id):
    client = bigtable.Client(project=project_id, admin=True)
    instance = client.instance(instance_id)
    table = instance.table(table_id)

    timestamp = datetime.datetime.utcnow()
    column_family_id = "stats_summary"

    row_key = "phone#4c410523#20190501"

    row = table.direct_row(row_key)
    row.set_cell(column_family_id,
                 "connected_cell",
                 1,
                 timestamp)
    row.set_cell(column_family_id,
                 "connected_wifi",
                 1,
                 timestamp)
    row.set_cell(column_family_id,
                 "os_build",
                 "PQ2A.190405.003",
                 timestamp)

    row.commit()
    print('Successfully wrote row {}.'.format(row_key))


def read_row(project_id, instance_id, table_id):
    client = bigtable.Client(project=project_id, admin=True)
    instance = client.instance(instance_id)
    table = instance.table(table_id)

    row_key = "phone#4c410523#20190501"

    row = table.read_row(row_key)
    print(row)


def print_row(row):
    print("Reading data for {}:".format(row.row_key.decode('utf-8')))
    for cf, cols in sorted(row.cells.items()):
        print("Column Family {}".format(cf))
        for col, cells in sorted(cols.items()):
            for cell in cells:
                labels = " [{}]".format(",".join(cell.labels)) \
                    if len(cell.labels) else ""
                print(
                    "\t{}: {} #{}{}".format(col.decode('utf-8'),
                                            cell.value.decode('utf-8'),
                                            cell.timestamp, labels))
    print("")


write_simple(
    project_id="msm-groupdata-datalake-dev",
    instance_id="jamiet-dp-tf-instance",
    table_id="user-agent")

read_row(
    project_id="myproject",
    instance_id="myinstance",
    table_id="mytable")
When I run it, this is the output I get:
Successfully wrote row phone#4c410523#20190501.
None
It's the None that is bothering me. Given that I am reading/writing the same row_key, I would expect to get a row back, but it seems I am not and I don't know why. Can anyone advise?

Looking at your code, I think the reason the response is None when you read is that you are not actually writing to your table.
You need to check what your column_family_id is so that you write to a column family that exists in the table you created. For example, I made a table for testing and my column_family_id was nf1, not stats_summary.
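As a quick sanity check you could list the table's column families before writing; here is a minimal sketch, assuming the same client objects as in your write_simple function:
    # Hypothetical check: confirm that the family used by set_cell actually exists
    client = bigtable.Client(project=project_id, admin=True)
    table = client.instance(instance_id).table(table_id)
    families = table.list_column_families()  # dict of {family_name: ColumnFamily}
    print(list(families.keys()))
    if "stats_summary" not in families:
        print("Column family 'stats_summary' does not exist, so writes to it will fail.")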
Best regards.

Related

How to check whether an item exists and renew only its value in Python gspread

How can I check whether an item already exists and renew only its value, and otherwise add it as a new entry?
For example:
I already have item 1 and value 1 in my sheet. When I get a new value for item 1, I want to renew value 1 only; otherwise, if I get a new item 2 and value 2, I want to add them in new columns.
I don't know how to write this; I have searched for a long time but cannot find an answer. Could anyone help me? Many thanks!
The steps of the script below are:
first, check my Gmail and get keyword 1
second, use the keyword to search data on a website (BeautifulSoup module)
last, upload the data to a Google Sheet (gspread module)
# Imports implied by the code below (SAC is assumed to be oauth2client's ServiceAccountCredentials)
import imaplib
import email
import re
import yaml
import requests
import gspread
from bs4 import BeautifulSoup
from oauth2client.service_account import ServiceAccountCredentials as SAC


def Check_emailbox(box='Inbox', lab='SUBJECT', title='[PASS]'):
    global email_content, report_info1, my_msg, report_info
    dirpath = 'XXX'
    with open(dirpath) as act:
        content = act.read()
        my_act = yaml.load(content, Loader=yaml.FullLoader)
        user, password = my_act['user'], my_act['password']
    imapUrl = 'imap.gmail.com'
    my_mail = imaplib.IMAP4_SSL(imapUrl)
    my_mail.login(user, password)
    print('Login gmail account success.')
    my_mail.select(box)
    key = lab
    value = title
    _, data = my_mail.search(None, key, value)
    mail_id_list = data[0].split()
    msg_id = mail_id_list[-1]
    res, data = my_mail.fetch(msg_id, '(RFC822)')
    report_info = []
    if res == 'OK':
        raw_msg_txt = data[0][1]
        try:
            my_msg = email.message_from_bytes(raw_msg_txt)
            print('Subject: ', my_msg['subject'])
            print('From: ', my_msg['from'])
            print('Time: ', my_msg['date'])
            print('------------------------------------------------------------------------------------')
            print('Content:')
            for part in my_msg.walk():
                email_content = part.get_payload()
                report_info.append(email_content)
                report_info1 = ''.join('%s' % id for id in report_info)
                print(report_info1, type(report_info1))
                # print('Hide info, if want to see detail, unmark previous code')
            print('------------------------------------------------------------------------------------')
            # my_mail.store(msg_id, '-FLAGS', '\SEEN')
        except AttributeError:
            my_msg = email.message_from_string(raw_msg_txt)
            print('AttributeError: ', my_msg)
    return email_content, my_msg, report_info, report_info1


Check_emailbox()

keyName = re.findall(r'Daily Report : (.*?)$', report_info1)
fwName = ''.join(keyName)
print(fwName)
# ↑ This data will be uploaded to the sheet, and this is the main item to check:
# if "fwName" exists, renew only the data below; if it does not exist, add a new entry in the next row.
fwVersion = ''.join(re.findall(r'\d-(.*?)-', fwName)).rsplit('.', 1)[0]
print(fwVersion)

# connect to the website and use beautifulsoup
ele = requests.get('XXXXXX')
felement = BeautifulSoup(ele.text, 'html.parser')
# print(felement.prettify())
fwinfo = felement.find(['a'], text=fwName)
fwhref = fwinfo.get('href')
print('Info: ', fwinfo)
print(fwhref)
rowid = ''.join(re.findall(r'data/(.*?)$', fwhref))
print('Download id is: ', rowid)
fwlink = 'XXXXXXXXX' + rowid
print('Download link: ', fwlink)

json_key = "XXXXXXX"
spread_url = ['https://spreadsheets.google.com/feeds']
connect_auth = SAC.from_json_keyfile_name(json_key, spread_url)
google_sheets = gspread.authorize(connect_auth)
sheet = google_sheets.open_by_key('XXXXXXXXX').worksheet('Pass Data')
Sheets = sheet
upload = []
upload.append(fwName)
upload.append(fwVersion)
upload.append(rowid)
upload.append(fwlink)
Sheets.append_row(upload)
print('==== Upload to Google Sheet Done. ====')
In your situation, how about the following modification?
Modified script:
In this case, please use your google_sheets.
# Please set your values here.
fwName = "###"
fwVersion = "###"
rowid = "###"
fwlink = "###"

sheet = google_sheets.open_by_key('XXXXXXXXX').worksheet("Pass Data")
values = sheet.get_all_values()[2:]
obj = {}
for i, r in enumerate(values):
    obj[r[0]] = i + 3
if obj.get(fwName):
    sheet.update("B" + str(obj.get(fwName)), [[fwVersion, rowid, fwlink]], value_input_option="USER_ENTERED")
When this script is run, the values are first retrieved from the sheet. Then, by searching for the value in column "A", the new values are written to the matching row.
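If the value is not found in column "A", the "add new one" case from your question could be handled with an else branch; here is a sketch that reuses the same sheet and obj variables from the modified script above:
row = obj.get(fwName)
if row:
    sheet.update("B" + str(row), [[fwVersion, rowid, fwlink]], value_input_option="USER_ENTERED")
else:
    # fwName is not present yet: append it as a new row at the bottom of the sheet.
    sheet.append_row([fwName, fwVersion, rowid, fwlink], value_input_option="USER_ENTERED")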
Note:
I prepared this modified script from your sample image, in which the first 2 rows are header rows and the search column is column "A". If you change your Spreadsheet layout, this script might no longer work, so please be careful about this.
References:
update(range_name, values=None, **kwargs)
get_all_values(**kwargs)

Can't make apache beam write outputs to bigquery when using DataflowRunner

I'm trying to understand why this pipeline writes no output to BigQuery.
What I'm trying to achieve is to calculate the USD index for the last 10 years, starting from different currency pair observations.
All the data is in BigQuery and I need to organize it and sort it chronologically (if there is a better way to achieve this, I'm glad to read it, because I think this might not be the optimal way to do it).
The idea behind the class Currencies() is to group (and keep) the last observation of each currency pair (e.g. EURUSD), update all currency pair values as they "arrive", sort them chronologically and finally get the open, high, low and close value of the USD index for each day.
This code works in my Jupyter notebook and in Cloud Shell using DirectRunner, but when I use DataflowRunner it does not write any output. To see if I could figure it out, I tried to just create the data using beam.Create() and then write it to BigQuery (which worked), and also to just read something from BQ and write it to another table (which also worked), so my best guess is that the problem is in the beam.CombineGlobally part, but I don't know what it is.
The code is as follows:
import logging
import collections
import apache_beam as beam
from datetime import datetime

SYMBOLS = ['usdjpy', 'usdcad', 'usdchf', 'eurusd', 'audusd', 'nzdusd', 'gbpusd']
TABLE_SCHEMA = "date:DATETIME,index:STRING,open:FLOAT,high:FLOAT,low:FLOAT,close:FLOAT"


class Currencies(beam.CombineFn):
    def create_accumulator(self):
        return {}

    def add_input(self, accumulator, inputs):
        logging.info(inputs)
        date, currency, bid = inputs.values()
        if '.' not in date:
            date = date + '.0'
        date = datetime.strptime(date, '%Y-%m-%dT%H:%M:%S.%f')
        data = currency + ':' + str(bid)
        accumulator[date] = [data]
        return accumulator

    def merge_accumulators(self, accumulators):
        merged = {}
        for accum in accumulators:
            ordered_data = collections.OrderedDict(sorted(accum.items()))
            prev_date = None
            for date, date_data in ordered_data.items():
                if date not in merged:
                    merged[date] = {}
                if prev_date is None:
                    prev_date = date
                else:
                    prev_data = merged[prev_date]
                    merged[date].update(prev_data)
                    prev_date = date
                for data in date_data:
                    currency, bid = data.split(':')
                    bid = float(bid)
                    currency = currency.lower()
                    merged[date].update({
                        currency: bid
                    })
        return merged

    def calculate_index_value(self, data):
        return data['usdjpy'] * data['usdcad'] * data['usdchf'] / (data['eurusd'] * data['audusd'] * data['nzdusd'] * data['gbpusd'])

    def extract_output(self, accumulator):
        ordered = collections.OrderedDict(sorted(accumulator.items()))
        index = {}
        for dt, currencies in ordered.items():
            if not all([symbol in currencies.keys() for symbol in SYMBOLS]):
                continue
            date = str(dt.date())
            index_value = self.calculate_index_value(currencies)
            if date not in index:
                index[date] = {
                    'date': date,
                    'index': 'usd',
                    'open': index_value,
                    'high': index_value,
                    'low': index_value,
                    'close': index_value
                }
            else:
                max_value = max(index_value, index[date]['high'])
                min_value = min(index_value, index[date]['low'])
                close_value = index_value
                index[date].update({
                    'high': max_value,
                    'low': min_value,
                    'close': close_value
                })
        return index


def main():
    query = """
    select date,currency,bid from data_table
    where date(date) between '2022-01-13' and '2022-01-16'
    and currency like ('%USD%')
    """

    options = beam.options.pipeline_options.PipelineOptions(
        temp_location='gs://PROJECT/temp',
        project='PROJECT',
        runner='DataflowRunner',
        region='REGION',
        num_workers=1,
        max_num_workers=1,
        machine_type='n1-standard-1',
        save_main_session=True,
        staging_location='gs://PROJECT/stag'
    )

    with beam.Pipeline(options=options) as pipeline:
        inputs = (pipeline
                  | 'Read From BQ' >> beam.io.ReadFromBigQuery(query=query, use_standard_sql=True)
                  | 'Accumulate' >> beam.CombineGlobally(Currencies())
                  | 'Flat' >> beam.ParDo(lambda x: x.values())
                  | beam.io.Write(beam.io.WriteToBigQuery(
                      table='TABLE',
                      dataset='DATASET',
                      project='PROJECT',
                      schema=TABLE_SCHEMA))
                  )


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    main()
The way I execute this is from the shell, using python3 -m first_script (is this the way I should run these batch jobs?).
What am I missing or doing wrong? This is my first attempt to use Dataflow, so I'm probably making every mistake in the book.
For whom it may help: I faced a similar problem, but I had already used the same code for a different flow that had a Pub/Sub input, where it worked flawlessly, whereas with a file-based input it simply did not. After a lot of experimenting I found that in the options I changed the flag
options = PipelineOptions(streaming=True, ..
to
options = PipelineOptions(streaming=False,
as of course it is not a streaming source, it's a bounded source, a batch. After I set this flag to False I found my rows in the BigQuery table. After it had finished, it even stopped the pipeline, as it was a batch operation. Hope this helps.
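Applied to the options in the question, the change would look roughly like this (a sketch only; the PROJECT/REGION/bucket values are placeholders as in the question):
from apache_beam.options.pipeline_options import PipelineOptions

# Batch (bounded) pipeline: make sure streaming is not left switched on
# from a previous streaming job's options.
options = PipelineOptions(
    runner='DataflowRunner',
    project='PROJECT',
    region='REGION',
    temp_location='gs://PROJECT/temp',
    staging_location='gs://PROJECT/stag',
    save_main_session=True,
    streaming=False,  # False for a bounded source such as ReadFromBigQuery
)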

Airflow Pipeline to read CSVs and load into PostgreSQL

So, I am trying to write an Airflow DAG to 1) read a few different CSVs from my local disk, 2) create different PostgreSQL tables, and 3) load the files into their respective tables. When I run the DAG, the second step seems to fail.
Below is the code for the DAG's logic operators:
# Imports implied by the excerpt (exact provider paths depend on your Airflow version)
import os
import logging

import pandas as pd
from airflow.providers.postgres.hooks.postgres import PostgresHook
from psycopg2.extras import execute_values

AIRFLOW_HOME = os.getenv('AIRFLOW_HOME')


def get_listings_data():
    listings = pd.read_csv(AIRFLOW_HOME + '/dags/data/listings.csv')
    return listings


def get_g01_data():
    demographics = pd.read_csv(AIRFLOW_HOME + '/dags/data/demographics.csv')
    return demographics


def insert_listing_data_func(**kwargs):
    ps_pg_hook = PostgresHook(postgres_conn_id="postgres")
    conn_ps = ps_pg_hook.get_conn()

    ti = kwargs['ti']
    insert_df = pd.DataFrame.listings

    if len(insert_df) > 0:
        col_names = ['host_id', 'host_name', 'host_neighbourhood', 'host_total_listings_count', 'neighbourhood_cleansed', 'property_type', 'price', 'has_availability', 'availability_30']
        values = insert_df[col_names].to_dict('split')
        values = values['data']
        logging.info(values)

        insert_sql = """
            INSERT INTO assignment_2.listings (host_name, host_neighbourhood, host_total_listings_count, neighbourhood_cleansed, property_type, price, has_availability, availability_30)
            VALUES %s
            """
        result = execute_values(conn_ps.cursor(), insert_sql, values, page_size=len(insert_df))
        conn_ps.commit()
    else:
        None
    return None


def insert_demographics_data_func(**kwargs):
    ps_pg_hook = PostgresHook(postgres_conn_id="postgres")
    conn_ps = ps_pg_hook.get_conn()

    ti = kwargs['ti']
    insert_df = pd.DataFrame.demographics

    if len(insert_df) > 0:
        col_names = ['LGA', 'Median_age_persons', 'Median_mortgage_repay_monthly', 'Median_tot_prsnl_inc_weekly', 'Median_rent_weekly', 'Median_tot_fam_inc_weekly', 'Average_num_psns_per_bedroom', 'Median_tot_hhd_inc_weekly', 'Average_household_size']
        values = insert_df[col_names].to_dict('split')
        values = values['data']
        logging.info(values)

        insert_sql = """
            INSERT INTO assignment_2.demographics (LGA,Median_age_persons,Median_mortgage_repay_monthly,Median_tot_prsnl_inc_weekly,Median_rent_weekly,Median_tot_fam_inc_weekly,Average_num_psns_per_bedroom,Median_tot_hhd_inc_weekly,Average_household_size)
            VALUES %s
            """
        result = execute_values(conn_ps.cursor(), insert_sql, values, page_size=len(insert_df))
        conn_ps.commit()
    else:
        None
    return None
And my PostgresOperator task for the demographics table (just an example) is below:
create_psql_table_demographics = PostgresOperator(
    task_id="create_psql_table_demographics",
    postgres_conn_id="postgres",
    sql="""
        CREATE TABLE IF NOT EXISTS postgres.demographics (
            LGA VARCHAR,
            Median_age_persons INT,
            Median_mortgage_repay_monthly INT,
            Median_tot_prsnl_inc_weekly INT,
            Median_rent_weekly INT,
            Median_tot_fam_inc_weekly INT,
            Average_num_psns_per_bedroom DECIMAL(10,1),
            Median_tot_hhd_inc_weekly INT,
            Average_household_size DECIMAL(10,2)
        );
        """,
    dag=dag)
Am I missing something in my code that stops create_psql_table_demographics from running successfully on Airflow?
If your PostgreSQL database has access to the CSV files,
you may simply use the copy_expert method of the PostgresHook class (see the documentation).
PostgreSQL is pretty efficient at loading flat files: you'll save a lot of CPU cycles by not involving Python (and Pandas!), not to mention the potential encoding issues that you would have to address.
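A minimal sketch of that approach (assuming the target table already exists, the CSV has a header row whose columns match the table, and the same connection id and file paths as in the question; the PostgresHook import path depends on your Airflow version):
from airflow.providers.postgres.hooks.postgres import PostgresHook

def load_demographics_csv():
    hook = PostgresHook(postgres_conn_id="postgres")
    # Let PostgreSQL parse and load the flat file directly via COPY.
    hook.copy_expert(
        sql="COPY assignment_2.demographics FROM STDIN WITH (FORMAT csv, HEADER true)",
        filename=AIRFLOW_HOME + "/dags/data/demographics.csv",
    )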

Optimization of code written using python and BAPI

I have Python code here which goes into SAP using the BAPI RFC_READ_TABLE, queries the USR02 table and brings back the results. The input is taken from column A of an Excel sheet and the output is pasted into column B.
The code runs fine. However, for 1000 records it takes approximately 8 minutes to run.
Can you please help with optimizing the code? I am really new to Python; I managed to write this heavy code but am now stuck at the optimization part.
It would be really great if this could run in 1-2 minutes at most.
from pyrfc import Connection, ABAPApplicationError, ABAPRuntimeError, LogonError, CommunicationError
from configparser import ConfigParser
from pprint import PrettyPrinter
import openpyxl

ASHOST = '***'
CLIENT = '***'
SYSNR = '***'
USER = '***'
PASSWD = '***'

conn = Connection(ashost=ASHOST, sysnr=SYSNR, client=CLIENT, user=USER, passwd=PASSWD)

try:
    wb = openpyxl.load_workbook('new2.xlsx')
    ws = wb['Sheet1']
    for i in range(1, len(ws['A']) + 1):
        x = ws['A' + str(i)].value
        options = [{'TEXT': "BNAME = '" + x + "'"}]
        fields = [{'FIELDNAME': 'CLASS'}, {'FIELDNAME': 'USTYP'}]
        pp = PrettyPrinter(indent=4)
        ROWS_AT_A_TIME = 10
        rowskips = 0
        while True:
            result = conn.call('RFC_READ_TABLE',
                               QUERY_TABLE='USR02',
                               OPTIONS=options,
                               FIELDS=fields,
                               ROWSKIPS=rowskips, ROWCOUNT=ROWS_AT_A_TIME)
            rowskips += ROWS_AT_A_TIME
            if len(result['DATA']) < ROWS_AT_A_TIME:
                break
        data_result = result['DATA']
        length_result = len(data_result)
        for line in range(0, length_result):
            a = data_result[line]["WA"].strip()
            wb = openpyxl.load_workbook('new2.xlsx')
            ws = wb['Sheet1']
            ws['B' + str(i)].value = a
            wb.save('new2.xlsx')
except CommunicationError:
    print("Could not connect to server.")
    raise
except LogonError:
    print("Could not log in. Wrong credentials?")
    raise
except (ABAPApplicationError, ABAPRuntimeError):
    print("An error occurred.")
    raise
EDIT:
So here is my updated code. For now, I have decided to output the data on the command line only. The output shows where the time is taken.
try:
    output_list = []
    wb = openpyxl.load_workbook('new3.xlsx')
    ws = wb['Sheet1']
    col = ws['A']
    col_lis = [col[x].value for x in range(len(col))]
    length = len(col_lis)
    for i in range(length):
        print("--- %s seconds Start of the loop ---" % (time.time() - start_time))
        x = col_lis[i]
        options = [{'TEXT': "BNAME = '" + x + "'"}]
        fields = [{'FIELDNAME': 'CLASS'}, {'FIELDNAME': 'USTYP'}]
        ROWS_AT_A_TIME = 10
        rowskips = 0
        while True:
            result = conn.call('RFC_READ_TABLE', QUERY_TABLE='USR02', OPTIONS=options, FIELDS=fields, ROWSKIPS=rowskips, ROWCOUNT=ROWS_AT_A_TIME)
            rowskips += ROWS_AT_A_TIME
            if len(result['DATA']) < ROWS_AT_A_TIME:
                break
        print("--- %s seconds in SAP ---" % (time.time() - start_time))
        data_result = result['DATA']
        length_result = len(data_result)
        for line in range(0, length_result):
            a = data_result[line]["WA"]
            output_list.append(a)
    print(output_list)
First, I put timing marks at different places in the code, having divided it into functional sections (SAP processing, Excel processing).
Upon analyzing the timings I found that most of the runtime is consumed by the Excel writing code;
consider the intervals:
16:52:37.306272
16:52:37.405006 moment it was fetched from SAP
16:52:37.552611 moment it was pushed to Excel
16:52:37.558631
16:52:37.634395 moment it was fetched from SAP
16:52:37.796002 moment it was pushed to Excel
16:52:37.806930
16:52:37.883724 moment it was fetched from SAP
16:52:38.060254 moment it was pushed to Excel
16:52:38.067235
16:52:38.148098 moment it was fetched from SAP
16:52:38.293669 moment it was pushed to Excel
16:52:38.304640
16:52:38.374453 moment it was fetched from SAP
16:52:38.535054 moment it was pushed to Excel
16:52:38.542004
16:52:38.618800 moment it was fetched from SAP
16:52:38.782363 moment it was pushed to Excel
16:52:38.792336
16:52:38.873119 moment it was fetched from SAP
16:52:39.034687 moment it was pushed to Excel
16:52:39.040712
16:52:39.114517 moment it was fetched from SAP
16:52:39.264716 moment it was pushed to Excel
16:52:39.275649
16:52:39.346005 moment it was fetched from SAP
16:52:39.523721 moment it was pushed to Excel
16:52:39.530741
16:52:39.610487 moment it was fetched from SAP
16:52:39.760086 moment it was pushed to Excel
16:52:39.771057
16:52:39.839873 moment it was fetched from SAP
16:52:40.024574 moment it was pushed to Excel
As you can see, the Excel writing part takes about twice as long as the SAP querying part.
What is wrong in your code is that you open/initialize the workbook and sheet in each loop iteration; this slows execution a lot and is redundant, as you can reuse the workbook variables from the top.
Another redundant step is stripping leading and trailing whitespace, which is unnecessary since Excel does this automatically for string data.
This variant of the code
try:
    wb = openpyxl.load_workbook('new2.xlsx')
    ws = wb['Sheet1']
    print(datetime.now().time())
    for i in range(1, len(ws['A']) + 1):
        x = ws['A' + str(i)].value
        options = [{'TEXT': "BNAME = '" + x + "'"}]
        fields = [{'FIELDNAME': 'CLASS'}, {'FIELDNAME': 'USTYP'}]
        ROWS_AT_A_TIME = 10
        rowskips = 0
        while True:
            result = conn.call('RFC_READ_TABLE', QUERY_TABLE='USR02', OPTIONS=options, FIELDS=fields, ROWSKIPS=rowskips, ROWCOUNT=ROWS_AT_A_TIME)
            rowskips += ROWS_AT_A_TIME
            if len(result['DATA']) < ROWS_AT_A_TIME:
                break
        data_result = result['DATA']
        length_result = len(data_result)
        for line in range(0, length_result):
            ws['B' + str(i)].value = data_result[line]["WA"]
    wb.save('new2.xlsx')
    print(datetime.now().time())
except ...
gives me the following timestamps for the program run:
>>> exec(open('RFC_READ_TABLE.py').read())
18:14:03.003174
18:16:29.014373
2.5 minutes for 1000 user records, which looks like a fair price for this kind of processing.
In my opinion, the problem is in the while True loop. I think you need to optimize your query logic (or change it); it is hard to say without knowing what you are interested in from the DB. The other parts look easy and fast.
Something that could help is to avoid opening and closing the file continuously: try to compute your "B" column and then open the xlsx file and paste everything at once, as sketched below. It could help (but I'm pretty sure the query is the problem).
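A minimal sketch of the "paste all at once" idea, assuming the results were collected into output_list as in the edited code, one value per input row:
from openpyxl import load_workbook

wb = load_workbook('new3.xlsx')
ws = wb['Sheet1']
# Write the whole B column in one pass and save once, instead of saving per record.
for row_index, value in enumerate(output_list, start=1):
    ws['B' + str(row_index)].value = value
wb.save('new3.xlsx')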
P.S. Maybe you can use a timing library to find out WHERE you spend most of the time.

export SQL to Excel with Pandas, how to make a drop down?

I have a simple MSSQL table with data in it like this:
IF OBJECT_ID('MY_BIG_TABLE') IS NOT NULL
    DROP TABLE [dbo].[MY_BIG_TABLE]

CREATE TABLE [dbo].[MY_BIG_TABLE](
    [ID] [int] NOT NULL,
    [PERSON] [varchar](50) NOT NULL,
    [STREET] [varchar](50) NOT NULL,
    [PHONE] [varchar](50) NOT NULL,
    [BATCH_ID] [int] NOT NULL
) ON [PRIMARY]
GO

INSERT INTO MY_BIG_TABLE (ID, PERSON, STREET, PHONE, BATCH_ID)
VALUES
    (1, 'BOB', 'MAIN STREET', '555-555-5555', 100),
    (2, 'SANDY', 'ELM', '666-555-5555', 100),
    (3, 'FRED', 'PINE', '777-555-5555', 200),
    (8, 'BOB', 'DIRT', '888-555-5555', 200),
    (52, 'GEORGE', 'RIVER ROAD', '999-555-5555', 700)
I'm exporting data out of it into Excel files using Python and Pandas like this:
import pypyodbc
import pandas


def main():
    server_name = "SERVER_NAME"
    database = "TEST_DATABASE"
    connection = pypyodbc.connect(
        "Driver={SQL Server};"
        "Server=" + server_name + ";"
        "Database=" + database + ";"
        "Trusted_Connection=yes;"
    )
    batch_id_list = [100, 200, 700]
    for batch_id in batch_id_list:
        print("--------------------------------------------")
        print("".join(["reading batch_id:", str(batch_id)]))
        file_name = "".join(["EXPORT_", str(batch_id), ".xlsx"])
        the_sql = """
            SELECT * FROM
            MY_BIG_TABLE
            WHERE BATCH_ID = ?"""
        data = pandas.read_sql(the_sql, connection, params=[batch_id])
        print("".join(["writing batch_id:", str(batch_id)]))
        data.to_excel("".join(["c:/temp/", file_name]))


if __name__ == "__main__":
    main()
And I get a nice little pile of Excel files. One file per BATCH_ID with all of those results loaded in. Works great. What I need to do is have one of the columns be an Excel drop down like this:
I could certainly go into Excel and make it for each file, but as I'm sure you can tell this is just sample data. I'm going to be making thousands of Excel files.
How can I, using what I have in Python, make one of the columns a drop down like the one in the picture? Is there some kind of template option I can take advantage of? I'm open to anything. I have control over the SQL data, so I can add values if that helps make it easier. Thanks in advance!
I was not able to use Pandas to add in the drop down, but I was able to read the file back in, update it, and write it back out like this:
from openpyxl.worksheet.datavalidation import DataValidation
from openpyxl import load_workbook


def add_drop_down(file_path, file_name, row_total):
    print("adding drop down")
    wb = load_workbook("".join([file_path, file_name]))
    ws = wb['Sheet1']
    ws['S2'] = 'Yes'
    ws['S3'] = 'No'
    ws['S4'] = 'Maybe'
    ws['S5'] = 'OK'
    ws['S6'] = 'Not OK'
    ws['S7'] = 'Check'
    ws['T2'] = 'What1'
    ws['T3'] = 'What2'
    ws['T4'] = 'What3'
    current_row = 2
    while current_row < row_total + 2:
        data_val_results = DataValidation(type="list", formula1='=S2:S7')
        data_val_status = DataValidation(type="list", formula1='=T2:T4')
        ws.add_data_validation(data_val_results)
        ws.add_data_validation(data_val_status)
        row_results = "".join(["O", str(current_row)])
        row_status = "".join(["P", str(current_row)])
        data_val_results.add(ws[row_results])
        data_val_status.add(ws[row_status])
        current_row += 1
    wb.save("".join([file_path, file_name]))
There is probably a cleverer way to do this, but for a one-shot export and shipping it, this works great! In this example the drop downs go all the way down the sheet up to a specified row number. I also put two different drop downs in this example. Thanks for pointing me in the right direction, guys!
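A slightly leaner variant, sketched under the same column-layout assumptions, creates each DataValidation once and applies it to a whole column range instead of cell by cell:
from openpyxl.worksheet.datavalidation import DataValidation
from openpyxl import load_workbook


def add_drop_down_ranged(file_path, file_name, row_total):
    wb = load_workbook(file_path + file_name)
    ws = wb['Sheet1']
    # One validation object per list, applied to an entire column range at once.
    data_val_results = DataValidation(type="list", formula1='=S2:S7')
    data_val_status = DataValidation(type="list", formula1='=T2:T4')
    ws.add_data_validation(data_val_results)
    ws.add_data_validation(data_val_status)
    data_val_results.add("O2:O{}".format(row_total + 1))
    data_val_status.add("P2:P{}".format(row_total + 1))
    wb.save(file_path + file_name)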
