Populating a MySQL table with scraped data - python

I'm using Python 3, MySQL, Sequel Pro and BeautifulSoup.
Put simply, I want to create a SQL table and then insert my downloaded data into that table.
I've used this answer as a template to build the SQL part (Beautiful soup webscrape into mysql), but it won't work.
Errors thrown:
line 86, finally: SyntaxError: invalid syntax
When I comment out this last finally: (just to see if the rest of the code works) I get:
InternalError: (1054, "Unknown column 'address' in 'field list'")
Another common error I got was:
ProgrammingError: (1146, "Table 'simple_scrape.simple3' doesn't exist")
though I can't remember the exact changes I made to end up with this error.
Finally: I started to learn programming (not just Python, but 'programming') less than four weeks ago. If you're wondering why I've done something stupid or inefficient, it's almost certainly because that was the first way I got it to work!
Please help!
Code:
from selenium import webdriver

#Guess BER Number
for i in range(108053983, 108053985):
    try:
        # ber_try = 100000000
        ber_try =+i
        #Open page & insert BER Number
        browser = webdriver.Firefox()
        type(browser)
        browser.get('https://ndber.seai.ie/pass/ber/search.aspx')
        ber_send = browser.find_element_by_id('ctl00_DefaultContent_BERSearch_dfSearch_txtBERNumber')
        ber_send.send_keys(ber_try)
        #click search
        form = browser.find_element_by_id('ctl00_DefaultContent_BERSearch_dfSearch_Bottomsearch')
        form.click()
        #click intermediate page
        form = browser.find_element_by_id('ctl00_DefaultContent_BERSearch_gridRatings_gridview_ctl02_ViewDetails')
        form.click()
        #scrape the page
        import bs4
        soup = bs4.BeautifulSoup(browser.page_source)
        # First Section
        ber_dec = soup.find('fieldset', {'id': 'ctl00_DefaultContent_BERSearch_fsBER'})
        address = ber_dec.find('div', {'id': 'ctl00_DefaultContent_BERSearch_dfBER_div_PublishingAddress'})
        address = (address.get_text(', ').strip())
        print(address)
        date_issue = ber_dec.find('span', {'id': 'ctl00_DefaultContent_BERSearch_dfBER_container_DateOfIssue'})
        date_issue = date_issue.get_text().strip()
        print(date_issue)
    except:
        print('Invalid BER Number:', ber_try)
        browser.quit()
    #connecting to mysql
    finally:
        import pymysql.cursors
        from pymysql import connect, err, sys, cursors
        #Making the connection
        connection = pymysql.connect(host='127.0.0.1',
                                     port=3306,
                                     user='root',
                                     passwd='root11',
                                     db='simple_scrape',
                                     cursorclass=pymysql.cursors.DictCursor);
        with connection.cursor() as cursor:
            sql = """CREATE TABLE `simple3`(
                (
                `ID` INT AUTO_INCREMENT NOT NULL,
                `address` VARCHAR( 200 ) NOT NULL,
                `date_issue` VARCHAR( 200 ) NOT NULL,
                PRIMARY KEY ( `ID` )
                )Engine = MyISAM)"""
            sql = "INSERT INTO `simple3` (`address`, `date_issue`) VALUES (%s, %s)"
            cursor.execute(sql, (address, date_issue))
        connection.commit()
    finally:
        connection.close()
        browser.quit()

Issues:
You assign the CREATE TABLE statement to sql but then overwrite it with the INSERT before ever executing it, so the table is never created. (Separately, the SyntaxError comes from having two finally: clauses on one try; a try statement can have at most one.) Execute the CREATE TABLE first, and actually create the table; the statement also had a stray extra parenthesis and a misplaced ENGINE clause, fixed below:
sql = """CREATE TABLE simple3 (
    ID INT AUTO_INCREMENT NOT NULL,
    address VARCHAR(200) NOT NULL,
    date_issue VARCHAR(200) NOT NULL,
    PRIMARY KEY (ID)
) ENGINE = MyISAM"""
# Added this line since your table was not being created.
cursor.execute(sql)
sql = "INSERT INTO simple3 (address, date_issue) VALUES (%s, %s)"
cursor.execute(sql, (address, date_issue))
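Put together, a minimal end-to-end sketch (assuming the same credentials and the address/date_issue variables scraped above; IF NOT EXISTS is used so re-runs don't fail):
import pymysql.cursors

# Sketch only: create the table once, then run the parameterized INSERT per record.
connection = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                             passwd='root11', db='simple_scrape',
                             cursorclass=pymysql.cursors.DictCursor)
try:
    with connection.cursor() as cursor:
        cursor.execute("""CREATE TABLE IF NOT EXISTS simple3 (
            ID INT AUTO_INCREMENT NOT NULL,
            address VARCHAR(200) NOT NULL,
            date_issue VARCHAR(200) NOT NULL,
            PRIMARY KEY (ID)
        ) ENGINE = MyISAM""")
        cursor.execute("INSERT INTO simple3 (address, date_issue) VALUES (%s, %s)",
                       (address, date_issue))
    connection.commit()  # pymysql does not autocommit by default
finally:
    connection.close()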

Related

mysql LOAD DATA INFILE of csv from python (Not Working)

After some data manipulation I store two columns in a txt file in csv format, as follows:
result.txt ->
id,avg
0,38.0
1,56.5
3,66.5
4,48.666666666666664
Then I store the data in a table, which is where I run into trouble: running the query from a .sql file stores the data successfully, but executing the same query from Python doesn't seem to work for some reason.
python code->
...
open('C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/result.txt', 'w').write(res)
print(res)
try:
    with mysql.connector.connect(
        host="localhost",
        user='root',
        password='tt',
        database="dp",
    ) as connection:
        clear_table_query = "drop table if exists test_db.marks;"
        create_table_query = '''
        create table test_db.marks (
            id varchar(255) not null,
            avg varchar(255) not null,
            primary key (id)
        );
        '''
        # dropping the table and recreating it works fine
        add_csv_query = "LOAD DATA INFILE 'C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/result.txt' INTO TABLE marks FIELDS TERMINATED BY ',' ENCLOSED BY '\"' LINES TERMINATED BY '\\n' IGNORE 1 LINES;"
        print(add_csv_query)  # query is printed correctly
        with connection.cursor() as cursor:
            cursor.execute(clear_table_query)
            cursor.execute(create_table_query)
            cursor.execute(add_csv_query)
            cursor.execute("SELECT * FROM test_db.marks;")  # this produces -> Unread result found
except mysql.connector.Error as e:
    print(e)
connection.close()
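A likely fix, as a sketch (assuming mysql-connector's default of autocommit being off): commit after the LOAD DATA so the loaded rows persist, and fetch the SELECT's rows so the connector doesn't complain about an unread result:
with connection.cursor() as cursor:
    cursor.execute(clear_table_query)
    cursor.execute(create_table_query)
    cursor.execute(add_csv_query)
    connection.commit()  # persist the loaded rows; autocommit is off by default
    cursor.execute("SELECT * FROM test_db.marks;")
    rows = cursor.fetchall()  # consume the result set to avoid "Unread result found"
    print(rows)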

How do I extract a nested tuple from a tuple?

I'm using snscrape to scrape Instagram. snscrape returns the data as tuples, but it wraps the Instagram data in a nested tuple, e.g.
for b in enumerate(sninstagram.InstagramUserScraper(username='houston_2731').get_items()):
    print(b)
output
(0, InstagramPost(url='https://www.instagram.com/p/CUdFfjEImHN/', date=datetime.datetime(2021, 9, 30, 17, 39, 20, tzinfo=datetime.timezone.utc), content='"Hardwork plus patience. A symbol of my sacrifice I\'m doing waiting." Nipsey Hussle \n\nIt\'s hard to believe what 5 months and a disciplined diet and hitting the gym hard can do. The first pic in the collage is me at a challenging point in my life. Depression and what not but I had to snap out of it and get in the gym and do the work. As I continue to embark on this fitness journey. I hope to inspire some to join me on this journey. \n\n#fitness #weightloss #muscles #gymmotivation #gymrat #intermittentfasting #fitnessmotivation #fitnessjourney #tenpercentbodyfat #shredded #fitnessgoals #hardwork #patience #discipline #dedication #hunger', thumbnailUrl='https://instagram.fjnb12-1.fna.fbcdn.net/v/t51.2885-15/243385646_584565779558058_6508985384396360110_n.webp?stp=dst-jpg_e35_s640x640_sh0.08&_nc_ht=instagram.fjnb12-1.fna.fbcdn.net&_nc_cat=106&_nc_ohc=nrtaOwxdg64AX8NQE-Z&edm=ABfd0MgBAAAA&ccb=7-4&oh=00_AT_xE-O75IP4MezdzoHM_WxAgbXiivb3aBFUMopAkxxJSA&oe=621D237E&_nc_sid=7bff83', displayUrl='https://instagram.fjnb12-1.fna.fbcdn.net/v/t51.2885-15/243385646_584565779558058_6508985384396360110_n.webp?stp=dst-jpg_e35&_nc_ht=instagram.fjnb12-1.fna.fbcdn.net&_nc_cat=106&_nc_ohc=nrtaOwxdg64AX8NQE-Z&edm=ABfd0MgBAAAA&ccb=7-4&oh=00_AT8JXpM2XKqA_d06LV10Qy_Jt1GYnvpjUEeVZZMRIdwgnQ&oe=621D237E&_nc_sid=7bff83', username='houston_2731', likes=1, comments=0, commentsDisabled=False, isVideo=False))
For this reason the output cannot be inserted into the database: the nested tuple causes a ValueError because the database doesn't recognize its type, and the insert fails. So what I want to do is extract the nested tuple and use it as the main tuple. How do I go about doing that?
class insta():
    def instagram(self):
        dbname = '******'
        user = '******'
        password = '******'
        host = '******'
        port = ****
        cur = None
        conn = None
        try:
            conn = psycopg2.connect(
                dbname = dbname,
                user = user,
                password = password,
                host = host,
                port = port
            )
            cur = conn.cursor()
            cur.execute('DROP TABLE IF EXISTS Machine_instagram')
            create_table = '''CREATE TABLE IF NOT EXISTS Machine_instagram (
                id serial PRIMARY KEY,
                url char,
                date timestamp,
                content char,
                thumbnailUrl char,
                displayUrl char,
                username char,
                likes int,
                comments int,
                commentsDisabled bool,
                isVideo bool)'''
            cur.execute(create_table)
            for b in enumerate(sninstagram.InstagramUserScraper(username='houston_2731').get_items()):
                insert_insta = 'INSERT INTO Machine_instagram (url, date, content, thumbnailUrl, displayUrl, username, likes, comments, commentsDisabled, isVideo) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
                insert_values = [(b)]
                for records in insert_values:
                    cur.execute(insert_insta, records)
                    conn.commit()
            print('completed')
        except Exception as error:
            print(error)
        finally:
            if cur is not None:
                cur.close()
            if conn is not None:
                conn.close()

insta1 = insta()
insta1.instagram()
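The nested tuple is simply what enumerate() yields: an (index, item) pair. Unpacking the pair in the loop and flattening the post's fields into a plain tuple (a sketch; the attribute names are taken from the repr printed above) gives psycopg2 values it can adapt:
for i, post in enumerate(sninstagram.InstagramUserScraper(username='houston_2731').get_items()):
    # one flat tuple per row, fields in the INSERT's column order
    insert_values = (post.url, post.date, post.content, post.thumbnailUrl,
                     post.displayUrl, post.username, post.likes, post.comments,
                     post.commentsDisabled, post.isVideo)
    cur.execute(insert_insta, insert_values)
conn.commit()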

I'm trying to insert data that I've scraped off twitter into my postgres database but failing

I'm using the snscrape lib to scrape twitter data. I want to insert this data into my database, but I seem to be failing no matter what method I try. When I use a loop and build a SQL query inside the loop to insert the values one by one, I get an IndexError and a TypeError. When I try to append the data to a list instead, I can't loop over each value one by one. Now I'm stuck and don't know what to do.
method 1
class Tweet_list():
    def tweets_list1(self):
        dbname = '******'
        user = '******'
        password = '******'
        host = '*******'
        port = ****
        cur = None
        conn = None
        try:
            conn = psycopg2.connect(
                dbname = dbname,
                user = user,
                password = password,
                host = host,
                port = port
            )
            cur = conn.cursor()
            cur.execute('DROP TABLE IF EXISTS Machine_twitter')
            create_table = '''CREATE TABLE IF NOT EXISTS Machine_twitter (
                id int PRIMARY KEY,
                Tweet text,
                Tweet_id int,
                Timestamp timestamp,
                Replys int,
                Retweets int,
                Likes int,
                Username char)'''
            cur.execute(create_table)
            for i, tweet in enumerate(sntwitter.TwitterSearchScraper('from:TheHoopCentral').get_items()):
                if i > 5:
                    break
                insert_tweet = 'INSERT INTO Machine_twitter (Tweet, Tweet_id, Timestamp, Replys, Retweets, Likes, Username) VALUES (%s, %s, %s, %s, %s, %s, %s)'
                insert_values = (tweet.content, tweet.id, tweet.date, tweet.replyCount, tweet.retweetCount, tweet.likeCount, tweet.user.username)
                cur.execute(insert_tweet, insert_values)
                conn.commit()
            print('completed')
        except Exception as error:
            print(error)
        finally:
            if cur is not None:
                cur.close()
            if conn is not None:
                conn.close()

tweets = Tweet_list()
tweets2 = Tweet_list()
tweets2.tweets_list1()
error
IndexError: list index out of range
method 2
def update_list1(self):
    tweets_list2 = []
    for i, tweet in enumerate(sntwitter.TwitterSearchScraper('from:TheHoopCentral').get_items()):
        if i > 100:
            break
        tweets_list2.append([tweet.content, tweet.id, tweet.likeCount, tweet.retweetCount, tweet.replyCount, tweet.user.username])
    tweet_df = pd.DataFrame(tweets_list2, columns=('tweet', 'tweet id', 'likeCount', 'retweetCount', 'replyCount', 'username'))
    tweet_df.head()
The problem with the second method is that after the list gets appended, I can't access the individual values (e.g. tweet.content) to insert them into the database. I've tried every method under the sun but I'm failing miserably. Can somebody help?
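One way to combine the two methods, as a sketch (the attribute names follow the snscrape fields already used in method 1): append plain tuples in the same order as the INSERT's columns, then write the whole batch with executemany:
rows = []
for i, tweet in enumerate(sntwitter.TwitterSearchScraper('from:TheHoopCentral').get_items()):
    if i > 100:
        break
    rows.append((tweet.content, tweet.id, tweet.date, tweet.replyCount,
                 tweet.retweetCount, tweet.likeCount, tweet.user.username))

insert_tweet = ('INSERT INTO Machine_twitter '
                '(Tweet, Tweet_id, Timestamp, Replys, Retweets, Likes, Username) '
                'VALUES (%s, %s, %s, %s, %s, %s, %s)')
cur.executemany(insert_tweet, rows)  # one tuple per row, columns in matching order
conn.commit()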

MySQL not accepting values from insert statement

So I'm trying to insert values into a MySQL database table, but the following error keeps popping up.
I would really appreciate some help.
This is the code I wrote to read a value from a file and store it in a database table.
import mysql.connector
import pickle
try:
    connection = mysql.connector.connect(host='localhost',
                                         database='PETROL',
                                         user='Sarthak',
                                         password='q1w2e3r4t5')
    cursor = connection.cursor()
    print(connection)
    fp1 = open("D:/Python/petrol/pdate/pdate.txt", "rb+")
    while True:
        try:
            pdate = pickle.load(fp1)
            cursor.execute("DROP TABLE IF EXISTS DATES")
            cursor.execute("CREATE TABLE DATES (ID INT AUTO_INCREMENT PRIMARY KEY, Date DATE)")
            cursor.execute("INSERT INTO DATES(Date) VALUES(pdate)")
            cursor.execute("SHOW TABLES")
            cursor.commit()
        except EOFError:
            fp6.close()
except mysql.connector.Error as error:
    print("Failed to create table in MySQL: {}".format(error))
cursor.close()
connection.close()
The following error keeps popping up:
Failed to create table in MySQL: 1054 (42S22): Unknown column 'pdate' in 'field list'
I am not able to figure out what problem is caused by the insert statement I wrote.
It's better to write your insert statement as a parameterized query; note that the parameters must be passed as a sequence, so a single value goes in a one-element tuple:
query = "INSERT INTO DATES(`Date`) VALUES(%s)"
cursor.execute(query, (pdate,))
import mysql.connector
import pickle
try:
    connection = mysql.connector.connect(host='localhost',
                                         database='PETROL',
                                         user='Sarthak',
                                         password='q1w2e3r4t5')
    cursor = connection.cursor()
    print(connection)
    fp1 = open("D:/Python/petrol/pdate/pdate.txt", "rb+")
    while True:
        try:
            pdate = pickle.load(fp1)
            cursor.execute("DROP TABLE IF EXISTS DATES")
            cursor.execute("CREATE TABLE DATES (ID INT AUTO_INCREMENT PRIMARY KEY, Date DATE)")
            query = "INSERT INTO DATES(`Date`) VALUES(%s)"
            cursor.execute(query, (pdate,))  # parameters go in a tuple
            cursor.execute("SHOW TABLES")
            connection.commit()  # use connection.commit instead of cursor.commit
        except EOFError:
            fp1.close()  # fp1, not the undefined fp6
            break
except mysql.connector.Error as error:
    print("Failed to create table in MySQL: {}".format(error))
cursor.close()
connection.close()
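The underlying rule, for reference: mysql-connector fills %s placeholders from a sequence, so even a single parameter must be wrapped in a tuple or list. A quick illustration:
query = "INSERT INTO DATES(`Date`) VALUES(%s)"
cursor.execute(query, (pdate,))  # correct: one-element tuple
cursor.execute(query, [pdate])   # also fine: one-element list
# cursor.execute(query, pdate)   # wrong: pdate is not wrapped in a sequence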

Importing data from an excel file using python into SQL Server

I have found some other questions that have a similar error to what I am getting, but have not been able to figure out how to resolve this based on the answers. I am trying to import an excel file into SQL Server with the help of python. This is the code I wrote:
import pandas as pd
import numpy as np
import pandas.io.sql
import pyodbc
import xlrd

server = "won't disclose private info"
db = 'private info'
conn = pyodbc.connect('DRIVER={SQL Server};SERVER=' + server + ';DATABASE=' + db + ';Trusted_Connection=yes')
cursor = conn.cursor()
book = xlrd.open_workbook("Daily Flash.xlsx")
sheet = book.sheet_by_name("Sheet1")

query1 = """CREATE TABLE [LEAF].[MK] ([LEAF][Lease_Number] varchar(255),
[LEAF][Start_Date] varchar(255), [LEAF][Report_Status] varchar(255),
[LEAF][Status_Date] varchar(255), [LEAF][Current_Status] varchar(255),
[LEAF][Sales_Rep] varchar(255), [LEAF][Customer_Name] varchar(255),
[LEAF][Total_Finance] varchar(255), [LEAF][Rate_Class] varchar(255),
[LEAF][Supplier_Name] varchar(255), [LEAF][DecisionStatus] varchar(255))"""
query = """INSERT INTO [LEAF].[MK] (Lease_Number, Start_Date, Report_Status,
Status_Date, Current_Status, Sales_Rep, Customer_Name, Total_Finance,
Rate_Class, Supplier_Name, DecisionStatus) VALUES (%s, %s, %s, %s, %s, %s, %s,
%s, %s, %s, %s)"""

for r in range(1, sheet.nrows):
    Lease_Number = sheet.cell(r,0).value
    Start_Date = sheet.cell(r,1).value
    Report_Status = sheet.cell(r,2).value
    Status_Date = sheet.cell(r,3).value
    Current_Status = sheet.cell(r,4).value
    Sales_Rep = sheet.cell(r,5).value
    Customer_Name = sheet.cell(r,6).value
    Total_Financed = sheet.cell(r,7).value
    Rate_Class = sheet.cell(r,8).value
    Supplier_Name = sheet.cell(r,9).value
    DecisionStatus = sheet.cell(r,10).value
    values = (Lease_Number, Start_Date, Report_Status, Status_Date,
              Current_Status, Sales_Rep, Customer_Name, Total_Financed,
              Rate_Class, Supplier_Name, DecisionStatus)
    cursor.execute(query1)
    cursor.execute(query, values)
database.commit()
database.close()
database.commit()
The error message I get is:
ProgrammingError                          Traceback (most recent call last)
<ipython-input-24-c525ebf0af73> in <module>()
     16
     17 # Execute sql Query
---> 18 cursor.execute(query, values)
     19
     20 # Commit the transaction
ProgrammingError: ('The SQL contains 0 parameter markers, but 11 parameters were supplied', 'HY000')
Can someone please explain the problem to me and how I can fix it? Thank you!
Update:
I have gotten that error message to go away based on the comments below. I also modified my query, because the table into which I am trying to insert values was not previously created, so I updated my code in an attempt to create it.
However, now I am getting the error message:
ProgrammingError: ('42000', '[42000] [Microsoft][ODBC SQL Server Driver][SQL
Server]The specified schema name "dbo" either does not exist or you do not
have permission to use it. (2760) (SQLExecDirectW)')
I tried changing that slightly by writing CREATE [HELLO][MK] instead of just CREATE MK, but that tells me that MK is already in the database... What steps should I take next?
Based on the conversation we had in our chat, here are a few takeaways:
After executing your CREATE TABLE query, make sure to commit it immediately before running any subsequent INSERT queries.
Use error catching for cases when the table already exists in the database. You asked whether the script would still run if you wanted to import more data into the table. The answer is no, since Python will throw an exception at cursor.execute(query1).
If you want to validate whether your insert operations were successful, you can do a simple record count check.
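One more note on the original error: pyodbc uses the qmark paramstyle, so placeholders must be written as ? rather than %s. With %s, the driver sees a query containing no parameter markers at all, which is exactly what "The SQL contains 0 parameter markers, but 11 parameters were supplied" means; that is also why the rewritten INSERT below uses ? markers. A one-line illustration (with an illustrative variable name):
# qmark placeholders are substituted from the supplied tuple
cursor.execute("INSERT INTO [LEAF].[MK] (Lease_Number) VALUES (?)", (lease_number,))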
EDIT
Yesterday, when I had @mkheifetz test my code out, he caught a minor bug where the validation check would return False. The reason was that the database already had existing records, so comparing against only the current data being imported made the validation fail. As a solution, I have modified the code again.
Below is how I would modify your code:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt
import pandas.io.sql
import pyodbc
import xlrd

server = 'XXXXX'
db = 'XXXXXdb'

# create Connection and Cursor objects
conn = pyodbc.connect('DRIVER={SQL Server};SERVER=' + server + ';DATABASE=' + db + ';Trusted_Connection=yes')
cursor = conn.cursor()

# read data
data = pd.read_excel('Flash Daily Apps through 070918.xls')

# rename columns
data = data.rename(columns={'Lease Number': 'Lease_Number',
                            'Start Date': 'Start_Date',
                            'Report Status': 'Report_Status',
                            'Status Date': 'Status_Date',
                            'Current Status': 'Current_Status',
                            'Sales Rep': 'Sales_Rep',
                            'Customer Name': 'Customer_Name',
                            'Total Financed': 'Total_Financed',
                            'Rate Class': 'Rate_Class',
                            'Supplier Name': 'Supplier_Name'})

# export
data.to_excel('Daily Flash.xlsx', index=False)

# Open the workbook and define the worksheet
book = xlrd.open_workbook("Daily Flash.xlsx")
sheet = book.sheet_by_name("Sheet1")

query1 = """
CREATE TABLE [LEAF].[ZZZ] (
    Lease_Number varchar(255),
    Start_Date varchar(255),
    Report_Status varchar(255),
    Status_Date varchar(255),
    Current_Status varchar(255),
    Sales_Rep varchar(255),
    Customer_Name varchar(255),
    Total_Finance varchar(255),
    Rate_Class varchar(255),
    Supplier_Name varchar(255),
    DecisionStatus varchar(255)
)"""

query = """
INSERT INTO [LEAF].[ZZZ] (
    Lease_Number,
    Start_Date,
    Report_Status,
    Status_Date,
    Current_Status,
    Sales_Rep,
    Customer_Name,
    Total_Finance,
    Rate_Class,
    Supplier_Name,
    DecisionStatus
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""

# execute create table
try:
    cursor.execute(query1)
    conn.commit()
except pyodbc.ProgrammingError:
    pass

# grab existing row count in the database for validation later
cursor.execute("SELECT count(*) FROM LEAF.ZZZ")
before_import = cursor.fetchone()

for r in range(1, sheet.nrows):
    Lease_Number = sheet.cell(r,0).value
    Start_Date = sheet.cell(r,1).value
    Report_Status = sheet.cell(r,2).value
    Status_Date = sheet.cell(r,3).value
    Current_Status = sheet.cell(r,4).value
    Sales_Rep = sheet.cell(r,5).value
    Customer_Name = sheet.cell(r,6).value
    Total_Financed = sheet.cell(r,7).value
    Rate_Class = sheet.cell(r,8).value
    Supplier_Name = sheet.cell(r,9).value
    DecisionStatus = sheet.cell(r,10).value

    # Assign values from each row
    values = (Lease_Number, Start_Date, Report_Status, Status_Date, Current_Status,
              Sales_Rep, Customer_Name, Total_Financed, Rate_Class, Supplier_Name,
              DecisionStatus)

    # Execute sql Query
    cursor.execute(query, values)

# Commit the transaction
conn.commit()

# If you want to check if all rows are imported
cursor.execute("SELECT count(*) FROM LEAF.ZZZ")
result = cursor.fetchone()
print((result[0] - before_import[0]) == len(data.index))  # should be True

# Close the database connection
conn.close()
