I am trying to load the 8m[rows] * 1k[columns] python dataframe into Mongo. For the performance improvement I have planned to use Mongo bulk operation.There will be update also I have to do in the collections on daily basis so that I used the upsert method of bulk operation. Below is the code which I have prepared,
def BulkLoad(self, Dataset):
counter = 0;
empty = '000'
columns = []
records = []
DataDict = {}
for col in Dataset.columns:
columns.append(col)
try:
db = self.getDatabase()
bulk = db.collection.initialize_ordered_bulk_op()
for j in range(len(Dataset)):
records.clear()
DataDict.clear()
DataDict.update(
{'CreatedBy': empty, 'ModifiedBy': empty, 'Value': Score})
for column in columns:
colValue = str(Dataset[column][j])
if (colValue == 'nan'):
colValue = colValue.replace('nan', '')
DataDict.update({column: colValue})
records.append(DataDict)
print("list is ",records)
Id = DataDict['Id']
Number = DataDict['Number']
print(DataDict)
bulk.find(
{'Id': Id, 'Number': Number}).upsert().update(
{
'$set': {'Id': Id, 'Number': Number,'Events':records}
})
counter += 1
if counter % 1000 == 0:
result = bulk.execute()
logging.info(pprint(result))
bulk = db.coll.initialize_ordered_bulk_op()
if counter % 1000 != 0:
result = bulk.execute()
logging.info(pprint(result))
except Exception as e:
logging.exception(e)
except BulkWriteError as bre:
logging.error(pprint(bre.details))
If I am loading the sample rows of 10 into Mongo collections, All the documents are having the same value of 10th row. I knew its because of python dictionary reference problem .
Can you please anyone give me suggestion on that ?
def BulkLoad(self, Dataset):
counter = 0;
empty = '000'
columns = []
records = []
for col in Dataset.columns:
columns.append(col)
try:
db = self.getDatabase()
bulk = db.collection.initialize_ordered_bulk_op()
for j in range(len(Dataset)):
DataDict = {}
DataDict.update(
{'CreatedBy': empty, 'ModifiedBy': empty, 'Value': Score})
for column in columns:
colValue = str(Dataset[column][j])
if (colValue == 'nan'):
colValue = colValue.replace('nan', '')
DataDict.update({column: colValue})
records.append(DataDict)
print("list is ",records)
Id = DataDict['Id']
Number = DataDict['Number']
print(DataDict)
bulk.find(
{'Id': Id, 'Number': Number}).upsert().update(
{
'$set': {'Id': Id, 'Number': Number,'Events':records}
})
counter += 1
if counter % 1000 == 0:
result = bulk.execute()
logging.info(pprint(result))
bulk = db.coll.initialize_ordered_bulk_op()
if counter % 1000 != 0:
result = bulk.execute()
logging.info(pprint(result))
except Exception as e:
logging.exception(e)
except BulkWriteError as bre:
logging.error(pprint(bre.details))
Related
I want to iterate through rows of a dataframe, I have read this datafarame from an Excel file, and it has 3 columns, I tried to get data of each row and convert it to a dict then store it in a list.
Dict format: {"subreddit_group": string, "links/caption": list, "Subreddits/flair": list}
my code:
data_list = []
linkcap_list = []
subredditflair_list = []
grp = ""
data = pd.read_excel("data.xlsx")
df = pd.DataFrame(data, columns=["subreddit_group", "links/caption", "subreddits/flair"])
data_raw = {}
for index, row in df.iterrows():
grp = row["subreddit_group"]
elms = row['links/caption'].split(",,")
for elm in elms:
elm_linkcap = {}
try:
link, cap = elm.split("^")
elm_linkcap = {"link": link, "caption": cap}
linkcap_list.append(elm_linkcap)
except ValueError as e:
elm_linkcap = {"link": elm, "caption": "none"}
linkcap_list.append(elm_linkcap)
elms = row['subreddits/flair'].split(",")
for elm in elms:
elm_subredditflair = {}
try:
subreddit, flair = elm.split("^")
elm_subredditflair = {"subreddit": subreddit, "flair": flair}
subredditflair_list.append(elm_subredditflair)
except ValueError as e:
elm_subredditflair = {"subreddit": elm, "flair": "none"}
subredditflair_list.append(elm_subredditflair)
data_raw = {"group": grp, "links_caps": linkcap_list, "subreddits_flairs": subredditflair_list}
data_list.append(data_raw)
I want to get the value of that column in each row, but every time I iterate through a row I get the value of the whole column I tried adding an index after the column name to access a specific cell, but I get a weird result, like this for example:
row['links/caption'][1]
how to iterate through a dataframe rows and access the value of each column, not the value of the whole column, I hope I was clear enough with my question, Thanks.
edit1:
using tuples.
data_list = []
linkcap_list = []
subredditflair_list = []
grp = ""
data = pd.read_excel("data.xlsx")
df = pd.DataFrame(data, columns=["subreddit_group", "links/caption", "subreddits/flair"])
data_raw = {}
for row in df.itertuples():
print(row)
grp = row[1]
elms_linkscaps = row[2].split(",,")
elms_subredditsflairs = row[3].split(",")
print(elms_subredditsflairs)
print(elms_linkscaps)
exit()
for elm in elms_linkscaps:
elm_linkcap = {}
try:
link, cap = elm.split("^")
elm_linkcap = {"link": link, "caption": cap}
linkcap_list.append(elm_linkcap)
except ValueError as e:
elm_linkcap = {"link": elm, "caption": "none"}
linkcap_list.append(elm_linkcap)
for elm in elms_subredditsflairs:
elm_subredditflair = {}
try:
subreddit, flair = elm.split("^")
elm_subredditflair = {"subreddit": subreddit, "flair": flair}
subredditflair_list.append(elm_subredditflair)
except ValueError as e:
elm_subredditflair = {"subreddit": elm, "flair": "none"}
subredditflair_list.append(elm_subredditflair)
data_raw = {"group": grp, "links_caps": linkcap_list, "subreddits_flairs": subredditflair_list}
data_list.append(data_raw)
I'm new in python and sqlalchemy.
I already have a delete method working if I construct the where conditions by hand.
Now, I need to read the columns and values from an enter request in yaml format and create the where conditions.
#enter data as yaml
items:
- item:
table: [MyTable,OtherTable]
filters:
field_id: 1234
#other_id: null
Here is what I try and can't go ahead:
for i in use_case_cfg['items']:
item = i.get('item')
for t in item['table']:
if item['filters']:
filters = item['filters']
where_conditions = ''
count = 0
for column, value in filters.items():
aux = str(getattr(t, column) == bindparam(value))
if count == 0:
where_conditions += aux
else:
where_conditions += ', ' + aux
count += 1
to_delete = inv[t].__table__.delete().where(text(where_conditions))
#to_delete = t.__table__.delete().where(getattr(t, column) == value)
else:
to_delete = inv[t].__table__.delete()
CoreData.session.execute(to_delete)
To me, it looks ok, but when I run, I got the error below:
sqlalchemy.exc.StatementError: (sqlalchemy.exc.InvalidRequestError) A value is required for bind parameter '9876'
[SQL: DELETE FROM MyTable WHERE "MyTable".field_id = %(1234)s]
[parameters: [{}]]
(Background on this error at: http://sqlalche.me/e/cd3x)
Can someone explain to me what is wrong or the proper way to do it?
Thanks.
There are two problems with the code.
Firstly,
str(getattr(t, column) == bindparam(value))
is binding the value as a placeholder, so you end up with
WHERE f2 = :Bob
but it should be the name that maps to the value in filters (so the column name in your case), so you end up with
WHERE f2 = :f2
Secondly, multiple WHERE conditions are being joined with a comma, but you should use AND or OR, depending on what you are trying to do.
Given a model Foo:
class Foo(Base):
__tablename__ = 'foo'
id = sa.Column(sa.Integer, primary_key=True)
f1 = sa.Column(sa.Integer)
f2 = sa.Column(sa.String)
Here's a working version of a segment of your code:
filters = {'f1': 2, 'f2': 'Bob'}
t = Foo
where_conditions = ''
count = 0
for column in filters:
aux = str(getattr(t, column) == sa.bindparam(column))
if count == 0:
where_conditions += aux
else:
where_conditions += ' AND ' + aux
count += 1
to_delete = t.__table__.delete().where(sa.text(where_conditions))
print(to_delete)
session.execute(to_delete, filters)
If you aren't obliged to construct the WHERE conditions as strings, you can do it like this:
where_conditions = [(getattr(t, column) == sa.bindparam(column))
for column in filters]
to_delete = t.__table__.delete().where(sa.and_(*where_conditions))
session.execute(to_delete, filters)
I'm working on database that uses lot of data. One invoice could have 7482 different articles. Validating invoice cost so much time, it took 26 minutes to validate one with 7482 articles. I find the method that take time to finish, it is the "action_move_create" inside "odoo\addons\account\models\account_invoice.py".
#api.multi
def action_move_create(self):
""" Creates invoice related analytics and financial move lines """
account_move = self.env['account.move']
for inv in self:
if not inv.journal_id.sequence_id:
raise UserError(_('Please define sequence on the journal related to this invoice.'))
if not inv.invoice_line_ids.filtered(lambda line: line.account_id):
raise UserError(_('Please add at least one invoice line.'))
if inv.move_id:
continue
if not inv.date_invoice:
inv.write({'date_invoice': fields.Date.context_today(self)})
if not inv.date_due:
inv.write({'date_due': inv.date_invoice})
company_currency = inv.company_id.currency_id
# create move lines (one per invoice line + eventual taxes and analytic lines)
iml = inv.invoice_line_move_line_get()
iml += inv.tax_line_move_line_get()
diff_currency = inv.currency_id != company_currency
# create one move line for the total and possibly adjust the other lines amount
total, total_currency, iml = inv.compute_invoice_totals(company_currency, iml)
name = inv.name or ''
if inv.payment_term_id:
totlines = inv.payment_term_id.with_context(currency_id=company_currency.id).compute(total, inv.date_invoice)[0]
res_amount_currency = total_currency
for i, t in enumerate(totlines):
if inv.currency_id != company_currency:
amount_currency = company_currency._convert(t[1], inv.currency_id, inv.company_id, inv._get_currency_rate_date() or fields.Date.today())
else:
amount_currency = False
# last line: add the diff
res_amount_currency -= amount_currency or 0
if i + 1 == len(totlines):
amount_currency += res_amount_currency
iml.append({
'type': 'dest',
'name': name,
'price': t[1],
'account_id': inv.account_id.id,
'date_maturity': t[0],
'amount_currency': diff_currency and amount_currency,
'currency_id': diff_currency and inv.currency_id.id,
'invoice_id': inv.id
})
else:
iml.append({
'type': 'dest',
'name': name,
'price': total,
'account_id': inv.account_id.id,
'date_maturity': inv.date_due,
'amount_currency': diff_currency and total_currency,
'currency_id': diff_currency and inv.currency_id.id,
'invoice_id': inv.id
})
part = self.env['res.partner']._find_accounting_partner(inv.partner_id)
line = [(0, 0, self.line_get_convert(l, part.id)) for l in iml]
line = inv.group_lines(iml, line)
line = inv.finalize_invoice_move_lines(line)
date = inv.date or inv.date_invoice
move_vals = {
'ref': inv.reference,
'line_ids': line,
'journal_id': inv.journal_id.id,
'date': date,
'narration': inv.comment,
}
move = account_move.create(move_vals)
# Pass invoice in method post: used if you want to get the same
# account move reference when creating the same invoice after a cancelled one:
move.post(invoice = inv)
# make the invoice point to that move
vals = {
'move_id': move.id,
'date': date,
'move_name': move.name,
}
inv.write(vals)
return True
Could you suggest some solutions?
We suppose that the hardware is efficient to run odoo correctly.
I optimize it by using raw sql query. I made these codes in account.invoice model:
The first one is the definition of _mock_create_move_line (called in action_move_create).
def _mock_create_move_line(self, model, values, move):
bad_names = ["analytic_line_ids", "tax_ids", "analytic_tag_ids"]
other_fields = [
"currency_id", "debit", "credit", "balance",
"debit_cash_basis", "credit_cash_basis", "balance_cash_basis",
"company_currency_id", "amount_residual",
"amount_residual_currency", "tax_base_amount", "reconciled",
"company_id", "counterpart"
]
cr = self.env.cr
quote = '"{}"'.format
columns = []
columns1 = []
for i, v in enumerate(values):
v = model._add_missing_default_values(v)
account_id = self.env['account.account'].browse(v['account_id'])
# compulsory columns and some stored related columns
# related fields are not triggered, krrrrr
v.update({
'move_id': move.id,
'date_maturity': move.date,
'company_id': account_id.company_id.id,
'date': move.date,
'journal_id': move.journal_id.id,
'user_type_id': account_id.user_type_id.id,
'create_uid': self.env.uid,
'create_date': fields.Datetime.now()
})
######
temp_column = []
for name, val in sorted(v.items()):
if name in bad_names:
continue
field = model._fields[name]
if field.column_type:
col_val = field.convert_to_column(val, model, v)
temp_column.append(col_val)
if not i:
columns1.append((name, field.column_format, col_val))
columns.append(tuple(temp_column))
model.check_access_rule('create')
try:
query = "INSERT INTO {} ({}) VALUES {} RETURNING id".format(
quote(model._table),
", ".join(quote(name) for name, fmt, val in columns1),
", ".join('%s' for fmt in columns),
)
cr.execute(query, columns)
ids = cr.fetchall()
# clear the model cache to take account of the new insertion
# if not executed, relationnal field will not be updated
model.invalidate_cache()
account_move_line_ids = model.browse(ids)
account_move_line_ids.modified(other_fields)
account_move_line_ids.recompute()
# update parent_path
account_move_line_ids._parent_store_create()
except Exception as e:
_logger.info(e)
cr.rollback()
return
The second one is the overriding of native method action_move_create. I make some modification, call _mock_create_move_line if there is 'raw_sql' in the context.
#api.multi
def action_move_create(self):
""" Creates invoice related analytics and financial move lines """
# TODO : make choice between ORM or raw sql according to the context
account_move = self.env['account.move']
for inv in self:
if not inv.journal_id.sequence_id:
raise UserError(_('Please define sequence on the journal related to this invoice.'))
if not inv.invoice_line_ids.filtered(lambda line: line.account_id):
raise UserError(_('Please add at least one invoice line.'))
if inv.move_id:
continue
if not inv.date_invoice:
inv.write({'date_invoice': fields.Date.context_today(self)})
if not inv.date_due:
inv.write({'date_due': inv.date_invoice})
company_currency = inv.company_id.currency_id
# create move lines (one per invoice line + eventual taxes and analytic lines)
iml = inv.invoice_line_move_line_get()
iml += inv.tax_line_move_line_get()
diff_currency = inv.currency_id != company_currency
# create one move line for the total and possibly adjust the other lines amount
total, total_currency, iml = inv.compute_invoice_totals(company_currency, iml)
name = inv.name or ''
if inv.payment_term_id:
totlines = \
inv.payment_term_id.with_context(currency_id=company_currency.id).compute(total, inv.date_invoice)[0]
res_amount_currency = total_currency
for i, t in enumerate(totlines):
if inv.currency_id != company_currency:
amount_currency = company_currency._convert(t[1], inv.currency_id, inv.company_id,
inv._get_currency_rate_date() or fields.Date.today())
else:
amount_currency = False
# last line: add the diff
res_amount_currency -= amount_currency or 0
if i + 1 == len(totlines):
amount_currency += res_amount_currency
iml.append({
'type': 'dest',
'name': name,
'price': t[1],
'account_id': inv.account_id.id,
'date_maturity': t[0],
'amount_currency': diff_currency and amount_currency,
'currency_id': diff_currency and inv.currency_id.id,
'invoice_id': inv.id
})
else:
iml.append({
'type': 'dest',
'name': name,
'price': total,
'account_id': inv.account_id.id,
'date_maturity': inv.date_due,
'amount_currency': diff_currency and total_currency,
'currency_id': diff_currency and inv.currency_id.id,
'invoice_id': inv.id
})
part = self.env['res.partner']._find_accounting_partner(inv.partner_id)
line = [(0, 0, self.line_get_convert(l, part.id)) for l in iml]
line = inv.group_lines(iml, line)
line = inv.finalize_invoice_move_lines(line)
date = inv.date or inv.date_invoice
if self.env.context.get('raw_sql', None):
move_vals = {
'ref': inv.reference,
'journal_id': inv.journal_id.id,
'date': date,
'narration': inv.comment,
}
# remove (0, 0, ...)
# override the group_lines method to avoid looping on next instruction
new_lines = [nl[2] for nl in line]
# TODO do not call compute here, add with ...norecompute()
move = account_move.create(move_vals)
move.env.cr.commit()
self._mock_create_move_line(self.env['account.move.line'], new_lines, move)
# Pass invoice in method post: used if you want to get the same
# account move reference when creating the same invoice after a cancelled one:
# compute move, it is not triggered automatically bc raw sql insertion
# is it correct to call it like this ? find better way
move._amount_compute()
move._compute_partner_id()
move._compute_matched_percentage()
else:
# make default behavior
move_vals = {
'ref': inv.reference,
'line_ids': line,
'journal_id': inv.journal_id.id,
'date': date,
'narration': inv.comment,
}
move = account_move.create(move_vals)
move.post(invoice=inv)
# make the invoice point to that move
vals = {
'move_id': move.id,
'date': date,
'move_name': move.name,
}
inv.write(vals)
return True
Now, the execution time is less than 1 minutes for about 7000 records to insert inside invoice.move.line
I am trying to scrape the "PRINCIPAL STOCKHOLDERS" table from the linktext fileand convert it to a csv file. Right now I am only half successful. Namely, I can locate the table and parse it but somehow I cannot convert the text table to a standard one. My code is attached. Can someone help me with it?
url = r'https://www.sec.gov/Archives/edgar/data/1034239/0000950124-97-003372.txt'
# Different approach, the first approach does not work
filing_url = requests.get(url)
content = filing_url.text
splited_data = content.split('\n')
table_title = 'PRINCIPAL STOCKHOLDERS'
END_TABLE_LINE = '- ------------------------'
def find_no_line_start_table(table_title,splited_data):
found_no_lines = []
for index, line in enumerate(splited_data):
if table_title in line:
found_no_lines.append(index)
return found_no_lines
table_start = find_no_line_start_table(table_title,splited_data)
# I need help with locating the table. If I locate the table use the above function, it will return two locations and I have to manually choose the correct one.
table_start = table_start[1]
def get_start_data_table(table_start, splited_data):
for index, row in enumerate(splited_data[table_start:]):
if '<C>' in row:
return table_start + index
def get_end_table(start_table_data, splited_data ):
for index, row in enumerate(splited_data[start_table_data:]):
if END_TABLE_LINE in row:
return start_table_data + index
def row(l):
l = l.split()
number_columns = 8
if len(l) >= number_columns:
data_row = [''] * number_columns
first_column_done = False
index = 0
for w in l:
if not first_column_done:
data_row[0] = ' '.join([data_row[0], w])
if ':' in w:
first_column_done = True
else:
index += 1
data_row[index] = w
return data_row
start_line = get_start_data_table(table_start, splited_data)
end_line = get_end_table(start_line, splited_data)
table = splited_data[start_line : end_line]
# I also need help with convert the text table to a CSV file, somehow the following function does not #recognize my column.
def take_table(table):
owner = []
Num_share = []
middle = []
middle_1 = []
middle_2 = []
middle_3 = []
prior_offering = []
after_offering = []
for r in table:
data_row = row(r)
if data_row:
col_1, col_2, col_3, col_4, col_5, col_6, col_7, col_8 = data_row
owner.append(col_1)
Num_share.append(col_2)
middle.append(col_3)
middle_1.append(col_4)
middle_2.append(col_5)
middle_3.append(col_6)
prior_offering.append(col_7)
after_offering.append(col_8)
table_data = {'owner': owner, 'Num_share': Num_share, 'middle': middle, 'middle_1': middle_1,
'middle_2': middle_2, 'middle_3': middle_3, 'prior_offering': prior_offering,
'after_offering': after_offering}
return table_data
#print (table)
dict_table = take_table(table)
a = pd.DataFrame(dict_table)
a.to_csv('trail.csv')
I think what you need to do is
pd.DataFrame.from_dict(dict_table)
instead of
pd.DataFrame(dict_table)
I am trying to insert records into a table, but only last record(result data) from the loop is inserting into the table
Here is the code i tried:
CDates = ['2020-05-10','2020-05-12','2020-05-13','2020-05-16','2020-05-20']
ResultData = {}
for date in CDates:
filterDate = Key('Date').eq(id)
appResponse = appTable.scan(FilterExpression = filterDate)
accResp = table.query(KeyConditionExpression = Key('PrimaryId').eq('Key'),FilterExpression = Key('Date').eq(date))
if len(accResp['Items']) == 0:
ResultData['PrimaryId'] = 'Key'
ResultData['CreatedDate'] = date
ResultData['Type'] = 'Appt'
ResultData['Id'] = str(uuid.uuid4())
print(ResultData)
table.put_item(Item=ResultData)
Not getting where did I go wrong
You assigned ResultData outside of the loop and changed the values for the same keys every time the loop ran. Try this:
CDates = ['2020-05-10', '2020-05-12', '2020-05-13', '2020-05-16', '2020-05-20']
for date in CDates:
filterDate = Key('Date').eq(id)
appResponse = appTable.scan(FilterExpression=filterDate)
accResp = table.query(
KeyConditionExpression=Key('PrimaryId').eq('Key'),
FilterExpression=Key('Date').eq(date))
if len(accResp['Items']) == 0:
ResultData = {
'PrimaryId': 'Key',
'CreationDate': date,
'Type': 'Appt',
'Id': str(uuid.uuid4())
}
print(ResultData)
table.put_item(Item=ResultData)