I am scraping a Shopify store using the products.json page. I'm attempting to insert the scraped products into my MySQL DB using the Python connector, but I'm hitting the error below:
Something went wrong: Failed executing the operation; b'Name'
Code is below:
import requests
import json
import pandas as pd
import mysql.connector
import ScraperConfig as conf

class myScraper():
    def __init__(self, baseurl):
        self.baseurl = baseurl

    def downloadjson(self, page):
        r = requests.get(self.baseurl + f'products.json?limit=250&page={page}', timeout=5)
        if r.status_code != 200:
            print('Bad status code', r.status_code)
        if len(r.json()['products']) > 0:
            data = r.json()['products']
            return data
        else:
            return

    def parsejson(self, jsondata):
        products = []
        for prod in jsondata:
            vendor = prod['vendor']
            name = prod['title']
            handle = prod['handle']
            createdDateTime = prod['created_at']
            description = prod['body_html']
            productType = prod['product_type']
            for images in prod['images']:
                vendorProductId = images['product_id']
                try:
                    imageURL = images['src']
                except:
                    imageURL = 'None'
            for variant in prod['variants']:
                item = {
                    'name': name,
                    'handle': handle,
                    'description': description,
                    'productVariantId': variant['id'],
                    'createdDateTime': createdDateTime,
                    'productType': productType,
                    'vendorProductId': vendorProductId,
                    'imageURL': imageURL,
                    'price': variant['price'],
                    'salePrice': variant['compare_at_price'],
                    'available': variant['available'],
                    'updatedDateTime': variant['updated_at'],
                    'vendor': vendor
                }
                products.append(item)
        return products

def main():
    scrape = myScraper('https://www.someshopifysite.com/')
    results = []
    for page in range(1, 2):
        data = scrape.downloadjson(page)
        print('Getting page: ', page)
        try:
            results.append(scrape.parsejson(data))
        except:
            print(f'Completed, total pages = {page - 1}')
            break
    return results

if __name__ == '__main__':
    db = mysql.connector.connect(
        user=conf.user,
        host=conf.host,
        passwd=conf.passwd,
        database=conf.database)
    cursor = db.cursor()
    products = main()
    totals = [item for i in products for item in i]
    for p in totals:
        sql = """INSERT INTO `table` (`Name`, `Handle`, `Descritpion`, `VariantId`, `CreatedDateTime`, `ProductType`, `VendorProductId`, `ImageURL`, `Price`, `SalePrice`, `Available`, `UpdatedDateTime`, `Vendor`)
                 VALUES (%(`Name`)s, %(`Handle`)s, %(`Descritpion`)s, %(`VariantId`)s, %(`CreatedDateTime`)s, %(`ProductType`)s, %(`VendorProductId`)s, %(`ImageURL`)s, %(`Price`)s, %(`SalePrice`)s, %(`Available`)s, %(`UpdatedDateTime`)s, %(`Vendor`)s)"""
        try:
            cursor.executemany(sql, totals)
            print('Committed to DB')
        except mysql.connector.Error as err:
            print("Something went wrong: {}".format(err))
        db.commit()
Remove backticks from the following and all similar parts of the query:
%(`Name`)s
In general I'd drop backticks except when quoting column names that collide with reserved keywords.
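For illustration, a minimal sketch of the corrected statement, assuming the dictionaries built in parsejson() are what gets passed as parameters. With named %(...)s placeholders, mysql.connector looks each value up by dictionary key, so the placeholder names must not be quoted and must match those keys exactly (the Descritpion spelling in the column list is kept as in the question):

sql = """INSERT INTO `table`
    (`Name`, `Handle`, `Descritpion`, `VariantId`, `CreatedDateTime`, `ProductType`,
     `VendorProductId`, `ImageURL`, `Price`, `SalePrice`, `Available`, `UpdatedDateTime`, `Vendor`)
VALUES
    (%(name)s, %(handle)s, %(description)s, %(productVariantId)s, %(createdDateTime)s, %(productType)s,
     %(vendorProductId)s, %(imageURL)s, %(price)s, %(salePrice)s, %(available)s, %(updatedDateTime)s, %(vendor)s)"""

# totals is the list of dicts built by parsejson(); executemany inserts them all at once.
cursor.executemany(sql, totals)
db.commit()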
I have a Python application where a client retrieves CSV data row by row from a server using a gRPC stream. Data from each row is added to a dictionary, which in turn is saved to a Redis database. When I attempt to retrieve the data from the Redis database in a separate Flask application, the data does not come out in order and is duplicated much of the time. How can I retrieve the data in order of the key, without duplicates?
Client
import time

import grpc
import redis

import route_guide_pb2
import route_guide_pb2_grpc

def run():
    # Average number of comments metric
    average_num_comments = 0
    response_count = 0
    comment_count = 0
    try:
        conn = redis.StrictRedis(host='redis', port=6379)
        conn.flushdb()
    except Exception as ex:
        print('Error:', ex)
    while True:
        with grpc.insecure_channel('redditserver:50051') as channel:
            stub = route_guide_pb2_grpc.RouteGuideStub(channel)
            responses = stub.SendRedditPost(route_guide_pb2.PostRequestReddit(response='Recieved'))
            # Single post with most letters in title
            lg_post_title = ''
            for response in responses:
                response_count += 1
                comment_count = int(response.num_comments) + comment_count
                average_num_comments = avg(response_count, comment_count)
                if len(response.title) > len(lg_post_title):
                    lg_post_title = response.title
                redisdict = {"Largest Post Title": lg_post_title, "Comment Count": comment_count, "Average No. Comments": average_num_comments}
                try:
                    conn = redis.StrictRedis(host='redis', port=6379)
                    conn.hmset(response_count, redisdict)
                except Exception as ex:
                    print('Error:', ex)
                time.sleep(2)
Flask Application
import sys
import time

import redis
from flask import Flask, render_template

app = Flask(__name__)

@app.route('/')
def get_page():
    data = ''
    try:
        conn = redis.StrictRedis(host='redis', port=6379, decode_responses=True)
        for key in conn.scan_iter():
            value = conn.hgetall(key)
            data = value
            time.sleep(2)
            print("KEY: " + key, file=sys.stderr)
            print(data, file=sys.stderr)
    except Exception as ex:
        data = 'Error:' + str(ex)
    return render_template('index.html', x=data)
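Regarding the ordering: Redis' SCAN returns keys in no particular order and may return a key more than once, so the loop above cannot rely on it. A minimal sketch of reading the hashes in numeric key order without duplicates, assuming the keys are the integer response_count values the client writes:

import redis

conn = redis.StrictRedis(host='redis', port=6379, decode_responses=True)

# Collect the keys once, drop duplicates with a set, then sort numerically.
keys = sorted(set(conn.scan_iter()), key=int)

for key in keys:
    value = conn.hgetall(key)
    print(key, value)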
I have code that handles the following structure from a website I'm scraping data from:
destinationAccount:
    ownerBuilding:
        label:
        _id:
    vban:
    _id:
When I try to read this key with
vban = str(transaction["destinationAccount"]["vban"])
It gives me KeyError: 'destinationAccount'
Does anyone have an idea why this comes up? When I run my code, it copies everything I need into the MySQL database, but as I said, the KeyError pops up and the interval isn't working
sched = BlockingScheduler()
sched.add_job(start, 'interval', seconds=5)
sched.start()
because it stops running after the error appears. When I comment out the line vban = str(transaction["destinationAccount"]["vban"]), no error comes up. I have checked more than 10 times now; the structure is there on the website, as shown at the top. Any solution would be amazing.
def getData():
    databaseConn = dbConnect()
    cursor = databaseConn.cursor()
    for x in range(3):
        x = x * 25
        transactions = json.loads(makeRequest("URL.bla/transactions?offset=" + str(x), authToken, True).text)
        for transaction in transactions:
            person = ""
            try:
                person = transaction["destinationAccount"]["ownerCharacter"]["name"]
            except:
                try:
                    person = transaction["destinationAccount"]["ownerFactory"]["label"]
                except:
                    try:
                        person = transaction["destinationAccount"]["ownerBuilding"]["label"]
                    except:
                        person = str("unbekannt")
            reference = ""
            try:
                reference = str(translateTable[transaction["reference"]])
            except:
                reference = str(transaction["reference"])
            vban = str(transaction["destinationAccount"]["vban"])
            amount = str(transaction["amount"])
            taxAmount = str(transaction["taxAmount"])
            gesamt = (float(amount) + float(taxAmount))
            created = parse(str(transaction["created"]))
            date = str(created.date())
            time = str(created.time()).split(".")[0]
            sql = "INSERT INTO finanzen (transaktion, date, time, sendto, vban, amount, tax, gesamt, text) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
            val = (str(transaction["uuid"]), date, time, str(person), vban, amount, taxAmount, gesamt, reference)
            try:
                cursor.execute(sql, val)
                databaseConn.commit()
            except:
                print("Fehler Datenbank")
    dbClose(databaseConn, cursor)
Print result:
{'_id': 'CENSORED',
'uuid': 'CENSORED',
'amount': 11.8421,
'taxAmount': 3.1479,
'type': 'digital',
'created': 'Date',
'reference': 'CENSORED',
'sourceAccount': {'_id': 'CENSORED',
'ownerCharacter': {'_id': 'CENSORED',
'name': 'NAME'},
'vban': 'NUMBER'},
'destinationAccount': {'_id': 'CENSORED',
'vban': 'NUMBER',
'ownerBuilding': {'_id': 'CENSORED',
'label': 'Eclipse Towers'}}}
It's difficult without seeing the full list, but I suspect some of the items are missing the key. Have you tried checking whether the key exists? Using your example:
transaction = {
    "_id": "CENSORED",
    "uuid": "CENSORED",
    "amount": 11.8421,
    "taxAmount": 3.1479,
    "type": "digital",
    "created": "Date",
    "reference": "CENSORED",
    "sourceAccount": {
        "_id": "CENSORED",
        "ownerCharacter": {
            "_id": "CENSORED",
            "name": "NAME"
        },
        "vban": "NUMBER"
    },
    "destinationAccount": {
        "_id": "CENSORED",
        "ownerBuilding": {
            "_id": "CENSORED",
            "label": "Eclipse Towers"
        }
    }
}

if 'vban' in transaction['destinationAccount']:
    vban = str(transaction["destinationAccount"]["vban"])
else:
    vban = "none"
Thanks to @Johnny John Boy for the hint.
vban = ""
try:
vban = str(transaction["destinationAccount"]["vban"])
except:
try:
vban = str(transaction["sourceAccount"]["vban"])
except:
vban = str("Unbekannt")
This is the solution that fixes the KeyError: there was a second case where the vban sits on the sourceAccount instead of the destinationAccount. Now it works as it should, without any errors.
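A compact alternative sketch, assuming the same structure, is to chain dict.get() with empty-dict defaults so a missing destinationAccount or vban falls back without raising:

vban = str(
    transaction.get("destinationAccount", {}).get("vban")
    or transaction.get("sourceAccount", {}).get("vban")
    or "Unbekannt"
)
# Note: a present-but-empty vban would also fall through to the next option here.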
I have the following code
from google.cloud import bigquery
from bs4 import BeautifulSoup as soup  # soup() below is presumably BeautifulSoup
import urllib.request
import json
import datetime

client = bigquery.Client()
dataset_id = 'dataset'  # replace with your dataset ID
table_id = 'table'      # replace with your table ID
table_ref = client.dataset(dataset_id).table(table_id)
table = client.get_table(table_ref)  # API request

rows_to_insert = []

bq = bigquery.Client(project='project-id')
query = """SELECT Url FROM `project-id.dataset.urltable`"""
query_job = bq.query(query)
data = query_job.result()
rows = list(data)

def main():
    for row in rows:
        URL = urllib.request.urlopen(row[0])
        soup_page = soup(URL, features="lxml")
        try:
            data = json.loads(soup_page.find_all('script', type='application/ld+json')[1].text)
        except:
            data = 'unknown'
        try:
            price_ruw = data['offers']['price']
            shopprice = price_ruw.replace(',', '.')
        except:
            shopprice = 0  # was `price = 0`; the append below expects `shopprice`
        try:
            ean = data['gtin13']
            ean = str(ean)
        except:
            ean = 'unknown'
        try:
            title_ruw1 = data['name']
            title_ruw = title_ruw1
            tile_trim = title_ruw[:750]
            title = tile_trim.replace("'", "")
        except:
            title = "unknown"
        try:
            reviews = data['aggregateRating']['reviewCount']
        except:
            reviews = 0
        try:
            score = (float(data['aggregateRating']['ratingValue']) * 2)
        except:
            score = 0
        datenow = (datetime.datetime.now())
        shoplink = row[0]
        rows_to_insert.append([shoplink, ean, title, reviews, score, shopprice, datenow])
    client.insert_rows(table, rows_to_insert)  # API request

main()
Testing this code in the Google Cloud Platform gives:
Error: function crashed. Details:
main() takes 0 positional arguments but 2 were given
However, deploying this code does not give an error; it's only when scheduling it that it fails, repeatedly giving the error above.
For deploying I use the following command (which works):
gcloud functions deploy <function> --entry-point main --runtime python37 --trigger-resource <name> --trigger-event google.pubsub.topic.publish --timeout 540s
It's not clear how you're triggering this function, but it seems like a "Background Function", which means that it needs to take two arguments, even if they're unused:
def main(data, context):
...
See https://cloud.google.com/functions/docs/concepts/events-triggers for more information.
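For illustration, a minimal sketch of a Pub/Sub-triggered background function with that signature; the payload handling is only an assumption, and the existing scraping/insert logic would go where the comment is:

import base64

def main(data, context):
    # 'data' is the Pub/Sub message; 'context' carries event metadata.
    # Both parameters must be accepted even if they are not used.
    if 'data' in data:
        message = base64.b64decode(data['data']).decode('utf-8')
        print('Received message:', message)
    # ... existing scraping and BigQuery insert logic goes here ...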
I'm having an issue using Python 3 with concurrent.futures' ProcessPoolExecutor and its map function.
My code is this:
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
import psycopg2
import psycopg2.extensions
import psycopg2.extras
from asq import query
import select
import concurrent.futures
import asyncio

class UpdateElastic:
    def __init__(self):
        conn = psycopg2.connect(
            "dbname=db user=mad password=hat host=blah",
            async_=True
        )
        self.wait(conn)
        cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        cur.execute("SELECT * FROM table",)
        self.wait(cur.connection)
        self.report_files = cur.fetchall()
        cur.execute("SELECT * FROM othertable",)
        self.wait(cur.connection)
        self.payment_events = cur.fetchall()
        cur.close()
        conn.close()
        self.esconn = Elasticsearch([{'host': 'elasticsearch.example.com', 'port': 1234}])
        # pass

    def wait(self, conn):
        while 1:
            state = conn.poll()
            if state == psycopg2.extensions.POLL_OK:
                break
            elif state == psycopg2.extensions.POLL_WRITE:
                select.select([], [conn.fileno()], [])
            elif state == psycopg2.extensions.POLL_READ:
                select.select([conn.fileno()], [], [])
            else:
                raise psycopg2.OperationalError("poll() returned %s" % state)

    def get_es_indices(self):
        indices = self.esconn.indices.get_alias("digital-sales-csv*")
        return list(indices.keys())

    def update_documents(self, index, scroll_id=None):
        print(index)
        # return index
        # documents = _get_es_documents(conn, index)
        # print(documents['_scroll_id'])
        # scroll_id = documents['_scroll_id']
        # for document in documents['hits']['hits']:
        #     ids = {
        #         "report_id": document['_source']['report_id'],
        #         "payment_id": document['_source']['payment_id'],
        #         "document_id": document['_id']
        #     }
        #     asyncio.run(_update_es_document(conn, index, report_files, payment_events, ids))
        # update_documents(index, conn, report_files, payment_events, scroll_id)

def main():
    print('main called')
    print('instantiating UpdateElastic')
    us = UpdateElastic()
    print('UpdateElastic instantiated')
    print('setting up ProcessPoolExecutor')
    blah = ['abc', 'def', 'ghi']
    with concurrent.futures.ProcessPoolExecutor(max_workers=5) as executor:
        print('calling executor.map')
        executor.map(us.update_documents, blah, timeout=10)

if __name__ == "__main__":
    main()
With this code, all I'm expecting it to do is print out the values of the array that I've passed, so:
'abc'
'def'
'ghi'
However, after printing 'calling executor.map', it hangs.
When I change my constructor to:
class UpdateElastic:
    def __init__(self):
        # conn = psycopg2.connect(
        #     "dbname=db user=mad password=hat host=blah",
        #     async_=True
        # )
        # self.wait(conn)
        # cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        # cur.execute("SELECT * FROM table",)
        # self.wait(cur.connection)
        # self.report_files = cur.fetchall()
        # cur.execute("SELECT * FROM othertable",)
        # self.wait(cur.connection)
        # self.payment_events = cur.fetchall()
        # cur.close()
        # conn.close()
        # self.esconn = Elasticsearch([{'host':'elasticsearch.example.com','port':1234}])
        pass
(containing only a "pass" in the constructor), it will actually print out the values of the array, as expected.
I'm running this on Python 3.7.3, on macOS Mojave 10.14.2.
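For context, executor.map(us.update_documents, blah) has to pickle the bound method, and with it the whole UpdateElastic instance (the fetched rows and the Elasticsearch client stored on self), to ship the work to the worker processes. A minimal sketch of the same map call using a plain module-level function, which only ships the strings (the names here are illustrative):

import concurrent.futures

def update_documents(index):
    # Runs in a worker process; open any connections here, not in the parent.
    print(index)
    return index

def main():
    blah = ['abc', 'def', 'ghi']
    with concurrent.futures.ProcessPoolExecutor(max_workers=5) as executor:
        for result in executor.map(update_documents, blah, timeout=10):
            print(result)

if __name__ == "__main__":
    main()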
I'm trying to migrate some models from OpenERP 7 to Odoo 8 by code. I want to insert objects into the new table keeping their original id numbers, but the ids are not preserved.
I want to insert the new object including its id number.
My code:
import openerp
from openerp import api, modules
from openerp.cli import Command
import psycopg2

class ImportCategory(Command):
    """Import categories from source DB"""

    def process_item(self, model, data):
        if not data:
            return
        # Model structure
        model.create({
            'id': data['id'],
            'parent_id': None,
            'type': data['type'],
            'name': data['name']
        })

    def run(self, cmdargs):
        # Connection to the source database
        src_db = psycopg2.connect(
            host="127.0.0.1", port="5432",
            database="db_name", user="db_user", password="db_password")
        src_cr = src_db.cursor()
        try:
            # Query to retrieve source model data
            src_cr.execute("""
                SELECT c.id, c.parent_id, c.name, c.type
                FROM product_category c
                ORDER BY c.id;
            """)
        except psycopg2.Error as e:
            print e.pgerror
        openerp.tools.config.parse_config(cmdargs)
        dbname = openerp.tools.config['db_name']
        r = modules.registry.RegistryManager.get(dbname)
        cr = r.cursor()
        with api.Environment.manage():
            env = api.Environment(cr, 1, {})
            # Define target model
            product_category = env['product.category']
            id_ptr = None
            c_data = {}
            while True:
                r = src_cr.fetchone()
                if not r:
                    self.process_item(product_category, c_data)
                    break
                if id_ptr != r[0]:
                    self.process_item(product_category, c_data)
                    id_ptr = r[0]
                    c_data = {
                        'id': r[0],
                        'parent_id': r[1],
                        'name': r[2],
                        'type': r[3]
                    }
            cr.commit()
How do I do that?
The only way I could find was to use reference attributes in other objects to relate them in the new database. I mean creating relations over location code, client code, order number, and so on, and, once they are created in the target database, looking them up and using the new ID.
def run(self, cmdargs):
    # Connection to the source database
    src_db = psycopg2.connect(
        host="localhost", port="5433",
        database="bitnami_openerp", user="bn_openerp", password="bffbcc4a")
    src_cr = src_db.cursor()
    try:
        # Query to retrieve source model data
        src_cr.execute("""
            SELECT fy.id, fy.company_id, fy.create_date, fy.name,
                   p.id, p.code, p.company_id, p.create_date, p.date_start, p.date_stop, p.special, p.state,
                   c.id, c.name
            FROM res_company c, account_fiscalyear fy, account_period p
            WHERE p.fiscalyear_id = fy.id AND c.id = fy.company_id AND p.company_id = fy.company_id
            ORDER BY fy.id;
        """)
    except psycopg2.Error as e:
        print e.pgerror
    openerp.tools.config.parse_config(cmdargs)
    dbname = openerp.tools.config['db_name']
    r = modules.registry.RegistryManager.get(dbname)
    cr = r.cursor()
    with api.Environment.manage():
        env = api.Environment(cr, 1, {})
        # Define target models
        account_fiscalyear = env['account.fiscalyear']
        id_fy_ptr = None
        fy_data = {}
        res_company = env['res.company']
        # The surrounding fetch loop is omitted in this extract.
        r = src_cr.fetchone()
        if not r:
            self.process_fiscalyear(account_fiscalyear, fy_data)
            break
        company = res_company.search([('name', 'like', r[13])])
        print "Company id: {} | Company name: {}".format(company.id, company.name)
The previous code is only an extract from the whole source code.
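To make the lookup idea concrete, a minimal sketch (with hypothetical names) of building an old-ID to new-ID map over a reference field such as the company name, so later inserts can point at the record created in the target database:

# Map each source company id to the id of the matching record in the target DB.
id_map = {}
src_cr.execute("SELECT id, name FROM res_company ORDER BY id;")
for old_id, name in src_cr.fetchall():
    match = res_company.search([('name', '=', name)], limit=1)
    if match:
        id_map[old_id] = match.id

# Later, when inserting records that reference a company:
# new_company_id = id_map.get(old_company_id)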