I have crawled a webpage in order to scrape certain information such as price, header, and so on.
Now my goal is to insert that information into a database. I have already set up the database with the fields that are needed.
This is my code:
def trade_spider(max_pages):
Language = "Japanese"
partner = La
location = Tokyo
already_printed = set()
for reg in Region:
count = 0
count1 = 0
page = -1
while page <= max_pages:
page += 1
response = urllib.request.urlopen("http://www.jsox.de/s/search.json?q=" + str(reg) +"&page=" + str(page))
jsondata = json.loads(response.read().decode("utf-8"))
format = (jsondata['activities'])
g_data = format.strip("'<>()[]\"` ").replace('\'', '\"')
soup = BeautifulSoup(g_data)
articles = soup.find_all("article", {"class": "activity-card activity-card-horizontal "})
try:
connection = mysql.connector.connect\
(host = "localhost", user = "root", passwd ="", db = "crawl")
except:
print("No connection to Server")
sys.exit(0)
cursor = connection.cursor()
cursor.execute("DELETE from prices_crawled where Location=" + str(location) + " and Partner=" + str(partner))
connection.commit()
for article in articles:
headers = article.find_all("h3", {"class": "activity"})
for header in headers:
header_initial = header.text.strip()
if header_initial not in already_printed:
already_printed.add(header_initial)
header_final = header_initial
prices = article.find_all("span", {"class": "price"})
for price in prices:
price_end = price.text.strip().replace(",","")[2:]
count1 += 1
if count1 > count:
pass
else:
price_final = price_end
deeplinks = article.find_all("a", {"class": "activity-card"})
for t in set(t.get("href") for t in deeplinks):
deeplink_initial = t
if deeplink_initial not in already_printed:
already_printed.add(deeplink_initial)
deeplink_final = deeplink_initial
cursor.execute('''INSERT INTO prices_crawled (price_id, Header, Price, Deeplink, Partner, Location, Language) \
VALUES(%s, %s, %s, %s, %s, %s, %s)''', ['None'] + [header_final] + [price_final] + [deeplink_final] + [partner] + [location] + [Language])
connection.commit()
cursor.close()
connection.close()
trade_spider(int(Spider))
The issue is that the information does not get into the database. Furthermore, I do not get any error message, so I do not know what I'm doing wrong.
Could you help me out? Any feedback is appreciated.
Is the delete statement working?
I think the problem is the way you are passing your variables.
Change your syntax to something like this:
sql_insert_tx = "INSERT INTO euro_currencies (pk,currency,rate,date) values (null,'USD','%s','%s')" % (usd,date)
cursor.execute(sql_insert_tx)
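Alternatively, a parameterized query lets the driver handle quoting instead of building the SQL with string formatting. This is only a sketch, assuming the same usd and date variables from the snippet above and the connection/cursor objects used in the question:
# Sketch: pass the values separately and let the driver quote them.
sql_insert_tx = ("INSERT INTO euro_currencies (pk, currency, rate, date) "
                 "VALUES (null, 'USD', %s, %s)")
cursor.execute(sql_insert_tx, (usd, date))
connection.commit()  # remember to commit on the connection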
I have a script which scrapes a website every 7 seconds and sends a WhatsApp message if a condition is true. But if any error occurs while scraping, the script stops. How can I keep the code running even if an error occurs in the script?
import mysql
import mysql.connector
from twilio.rest import Client
import requests
from bs4 import BeautifulSoup
import sched
import time
s = sched.scheduler(time.time, time.sleep)
account_sid = "xxx"
account_token = "xxx"
client = Client(account_sid, account_token)
from_whatsapp_number = "whatsapp:xxx"
to_ali = "whatsapp:xxx"
number = "+xxx"
to_me = "whatsapp:xxx"
tolist = [to_me,to_ali]
# , to_ali, to_yiho, to_eno, to_huso,to_ramo, to_yuno, tohuso1, tohuso2, tohuso3, tohuso4
url = "https://sports2.holiganbet{}.com/tr/spor/yuksek-oran/101/dunya/240/lokasyon".format(
659)
def fetch_data(sc):
URLtest = url
req = requests.get(URLtest)
soupi = BeautifulSoup(req.content, 'html.parser')
container = soupi.find('a', attrs={'class': 'Anchor NavList__Anchor'})
if not container:
print("No match")
elif container:
time.sleep(2)
mydb = mysql.connector.connect(
host="localhost",
user="root",
password="",
database="holi",)
mycursor = mydb.cursor()
URL = url
r = requests.get(URL)
soup = BeautifulSoup(r.content, 'html.parser')
# liste_elemanlari = soup.find_all('li', attrs={'class': 'NavList__Item'})
liste_elemanlari = soup.find('ul', attrs={'class': 'NavList'})
linkelementleri = liste_elemanlari.find_all(
'a', attrs={'class': 'Anchor NavList__Anchor'}, href=True)
for link_element in linkelementleri:
unique_id = link_element['href'].split("/")[-1]
macadi = link_element.text
mycursor.execute(
"SELECT macadi, COUNT(*) FROM maclar WHERE macadi = %s and title = %s GROUP BY macadi", (macadi, macadi))
myresult = mycursor.fetchall()
# gets the number of rows affected by the command executed
row_count = mycursor.rowcount
if row_count == 0:
for person in tolist:
client.messages.create(
body=link_element,
from_=from_whatsapp_number,
to=person
)
time.sleep(1)
sql = "INSERT INTO maclar (id, macadi,title) VALUES (%s, %s, %s)"
val = (unique_id, macadi, macadi)
mycursor.execute(sql, val)
mydb.commit()
if row_count > 0:
print("Maç Mevcut Mesaj Yok")
time.sleep(1)
sc.enter(10, 1, fetch_data, (sc,))
s.enter(10, 1, fetch_data, (s,))
s.run()
Add a try/except block and handle the error gracefully without re-raising it.
Exception handling:
try:
    # your logic
except Exception as e:
    time.sleep(2)  # or simply pass
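Applied to the scheduler above, one way to keep the job alive is to wrap the scraping work inside fetch_data and always re-schedule in a finally block. This is only a sketch reusing the existing fetch_data and scheduler names; the try body stands in for the scraping, Twilio and MySQL logic already shown:
def fetch_data(sc):
    try:
        # ... the existing scraping / WhatsApp / MySQL logic goes here ...
        pass
    except Exception as e:
        # Log and swallow the error so one bad scrape does not stop the loop.
        print("Error during scrape:", e)
        time.sleep(2)
    finally:
        # Re-schedule the next run whether or not this one succeeded.
        sc.enter(10, 1, fetch_data, (sc,))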
ctime = []
name = []
minprice = []
maxprice = []
stock = []
historical_sold = []
sold = []
option_name = []
option_stock = []
option_price = []
#Connect to SQL database
conn = sqlite3.connect('etracker.db')
#Create cursor to work with database
c = conn.cursor()
c.execute('''Create TABLE if not exists server("prices")''')
with open('eurls.csv', 'r') as f:
csv_reader = csv.reader(f)
for row in csv_reader:
asins.append(row[0])
asin = asin
date = datetime.datetime.today().strftime("%d/%m/%Y %H:%M:%S")
url = f"https://e.{tld}/api/item/get"
querystring = {"itemid":f"{itemid}","shopid":f"{shopid}"}
payload = ""
headers = {
"cookie": "SPC_SI=9egeYgAAAABlNUJsazZUbPQ60gAAAAAAeVpFRmJWb00%3D; REC_T_ID=c56cc396-9f13-11ec-b054-2cea7fad64d2;",
}
response = requests.request("GET", url, data=payload, headers=headers, params=querystring)
# json object
result_json = response.json()
# starting point
result = result_json['data']
# These are the results from the API based on starting point ['data']
name = result['name']
minprice = result['price_min']/100000
maxprice = result['price_max']/100000
stock = result['normal_stock']
historical_sold = result['historical_sold']
sold = result['sold']
# starting point
option_items = result_json['data']['models']
for option_item in option_items:
# option name
try:
option_name.append(option_item['name'])
except:
option_name.append('')
# option stock
try:
option_stock.append(option_item['normal_stock'])
except:
option_stock.append('')
# option price
try:
option_price.append(option_item['price']/100000)
except:
option_price.append('')
print(option_name, option_stock, option_price)
print(date, name, minprice, maxprice, stock, historical_sold, sold, asin)
c.execute('''INSERT INTO prices VALUES(?,?,?,?,?,?,?,?,?,?,?)''', (date, name, minprice, maxprice, stock, historical_sold, sold, asin))
print(f'Added data for {name}, {minprice}')
#Insert links into table
def data_entry():
for item in option_name:
c.execute("INSERT INTO server(prices) VALUES(?)", (option_name))
#conn.commit()
data_entry() # ==> call the function
conn.commit()
I am getting an error and am unable to add these to the SQL database, since some products may have 3 options and others may have 20.
How do I handle that? I read that I need to loop through the list and insert it, but when I tried that I got this error:
sqlite3.ProgrammingError: Incorrect number of bindings supplied. The current statement uses 11, and there are 8 supplied.
Thank you
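For reference, sqlite3 raises this error whenever the number of ? placeholders in the statement does not match the number of values supplied. A minimal sketch of the rule, using a throwaway in-memory table (the table and columns here are hypothetical, not the etracker schema):
import sqlite3

conn = sqlite3.connect(":memory:")
c = conn.cursor()
c.execute("CREATE TABLE demo (option_name, option_stock, option_price)")

# Three placeholders, so each row must supply exactly three values.
c.execute("INSERT INTO demo VALUES (?, ?, ?)", ("small", 3, 9.99))

# A variable number of options can be stored as one row per option.
options = [("medium", 5, 14.99), ("large", 7, 19.99)]
c.executemany("INSERT INTO demo VALUES (?, ?, ?)", options)
conn.commit()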
I'm writing a web scraping program to collect data from truecar.com.
My database has 3 columns,
and when I run the program I get this error: list index out of range.
Here is what I've done so far:
import mysql.connector
from bs4 import BeautifulSoup
import requests
import re
# take the car's name
requested_car_name = input()
# inject the car's name into the URL
my_request = requests.get('https://www.truecar.com/used-cars-for-sale/listings/' +
requested_car_name + '/location-holtsville-ny/?sort[]=best_match')
my_soup = BeautifulSoup(my_request.text, 'html.parser')
# ************ car_model column in database ******************
car_model = my_soup.find_all(
'span', attrs={'class': 'vehicle-header-make-model text-truncate'})
# we have a list of car models
car_list = []
for item in range(20):
# appends car_model to car_list
car_list.append(car_model[item].text)
car_string = ', '.join('?' * len(car_list))
# ************** price column in database *****************************
price = my_soup.find_all(
'div', attrs={'data-test': 'vehicleCardPricingBlockPrice'})
price_list = []
for item in range(20):
# appends price to price_list
price_list.append(price[item].text)
price_string = ', '.join('?' * len(price_list))
# ************** distance column in database ***************************
distance = my_soup.find_all('div', attrs={'data-test': 'vehicleMileage'})
distance_list = []
for item in range(20):
# appends distance to distance_list
distance_list.append(distance[item].text)
distance_string = ', '.join('?' * len(distance_list))
# check the connection
print('CONNECTING ...')
mydb = mysql.connector.connect(
host="xxxxx",
user="xxxxxx",
password="xxxxxx",
port='xxxxxx',
database='xxxxxx'
)
print('CONNECTED')
# checking the connection is done
my_cursor = mydb.cursor(buffered=True)
insert_command = 'INSERT INTO car_name (car_model, price, distance) VALUES (%s, %s, %s);' % (car_string, price_string, distance_string)
# values = (car_string, price_string, distance_string)
my_cursor.execute(insert_command, car_list, price_list, distance_list)
mydb.commit()
print(my_cursor.rowcount, "Record Inserted")
mydb.close()
I also have another problem: I can't insert a list into my columns, and although I have tried many ways, I haven't been able to get it working.
I think the problem is in this line:
IndexError Traceback (most recent call last)
<ipython-input-1-4a3930bf0f57> in <module>
23 for item in range(20):
24 # appends car_model to car_list
---> 25 car_list.append(car_model[item].text)
26
27 car_string = ', '.join('?' * len(car_list))
IndexError: list index out of range
I don't want it to insert the whole list into one row in the database. I want the price, model, and mileage of the first 20 cars on truecar.com in my database.
Yes, you are hard-coding the length. Change how you are iterating through your soup elements, like so:
import mysql.connector
from bs4 import BeautifulSoup
import requests
# take the car's name
requested_car_name = input('Enter car name: ')
# inject the car's name into the URL
my_request = requests.get('https://www.truecar.com/used-cars-for-sale/listings/' +
requested_car_name + '/location-holtsville-ny/?sort[]=best_match')
my_soup = BeautifulSoup(my_request.text, 'html.parser')
# ************ car_model column in database ******************
car_model = my_soup.find_all(
'span', attrs={'class': 'vehicle-header-make-model text-truncate'})
# we have a list of car models
car_list = []
for item in car_model:
# appends car_model to car_list
car_list.append(item.text)
# ************** price column in database *****************************
price = my_soup.find_all(
'div', attrs={'data-test': 'vehicleCardPricingBlockPrice'})
price_list = []
for item in price:
# appends price to price_list
price_list.append(item.text)
# ************** distance column in database ***************************
distance = my_soup.find_all('div', attrs={'data-test': 'vehicleMileage'})
distance_list = []
for item in distance:
# appends distance to distance_list
distance_list.append(item.text)
# check the connection
print('CONNECTING ...')
mydb = mysql.connector.connect(
host="xxxxx",
user="xxxxxx",
password="xxxxxx",
port='xxxxxx',
database='xxxxxx'
)
print('CONNECTED')
# checking the connection is done
my_cursor = mydb.cursor(buffered=True)
insert_command = 'INSERT INTO car_name (car_model, price, distance) VALUES (%s, %s, %s)'
values = list(zip(car_list, price_list, distance_list))
my_cursor.executemany(insert_command, values)
mydb.commit()
print(my_cursor.rowcount, "Record Inserted")
mydb.close()
ALTERNATE:
there's also the API where you can fetch the data:
import mysql.connector
import requests
import math
# take the car's name
requested_car_name = input('Enter car name: ')
# inject the car's name into the URL
url = 'https://www.truecar.com/abp/api/vehicles/used/listings'
payload = {
'city': 'holtsville',
'collapse': 'true',
'fallback': 'true',
'include_incentives': 'true',
'include_targeted_incentives': 'true',
'make_slug': requested_car_name,
'new_or_used': 'u',
'per_page': '30',
'postal_code': '',
'search_event': 'true',
'sort[]': 'best_match',
'sponsored': 'true',
'state': 'ny',
'page':'1'}
jsonData = requests.get(url, params=payload).json()
total = jsonData['total']
total_pages = math.ceil(total/30)
total_pages_input = input('There are %s pages to iterate.\nEnter the number of pages to go through or type ALL: ' %total_pages)
if total_pages_input.upper() == 'ALL':
total_pages = total_pages
else:
total_pages = int(total_pages_input)
values = []
for page in range(1,total_pages+1):
if page == 1:
car_listings = jsonData['listings']
else:
payload.update({'page':'%s' %page})
jsonData = requests.get(url, params=payload).json()
car_listings = jsonData['listings']
for listing in car_listings:
vehicle = listing['vehicle']
ex_color = vehicle['exterior_color']
in_color = vehicle['interior_color']
location = vehicle['location']
price = vehicle['list_price']
make = vehicle['make']
model = vehicle['model']
mileage = vehicle['mileage']
style = vehicle['style']
year = vehicle['year']
engine = vehicle['engine']
accidentCount = vehicle['condition_history']['accidentCount']
ownerCount = vehicle['condition_history']['ownerCount']
isCleanTitle = vehicle['condition_history']['titleInfo']['isCleanTitle']
isFrameDamaged = vehicle['condition_history']['titleInfo']['isFrameDamaged']
isLemon = vehicle['condition_history']['titleInfo']['isLemon']
isSalvage = vehicle['condition_history']['titleInfo']['isSalvage']
isTheftRecovered = vehicle['condition_history']['titleInfo']['isTheftRecovered']
values.append((ex_color, in_color,location,price,make,model,mileage,
style,year,engine,accidentCount,ownerCount,isCleanTitle,isFrameDamaged,
isLemon, isSalvage,isTheftRecovered))
print('Completed: Page %s of %s' %(page,total_pages))
# check the connection
print('CONNECTING ...')
mydb = mysql.connector.connect(
host="xxxxx",
user="xxxxxx",
password="xxxxxx",
port='xxxxxx',
database='xxxxxx'
)
print('CONNECTED')
# checking the connection is done
my_cursor = mydb.cursor(buffered=True)
# create_command = ''' create table car_information (exterior_color varchar(255), interior_color varchar(255),location varchar(255),price varchar(255),make varchar(255),model varchar(255),mileage varchar(255),
# style varchar(255),year varchar(255),engine varchar(255),accidentCount varchar(255),ownerCount varchar(255),isCleanTitle varchar(255),isFrameDamaged varchar(255),
# isLemon varchar(255), isSalvage varchar(255),isTheftRecovered varchar(255))'''
# my_cursor.execute(create_command)
# print('created')
insert_command = '''INSERT INTO car_name (exterior_color, interior_color,location,price,make,model,mileage,
style,year,engine,accidentCount,ownerCount,isCleanTitle,isFrameDamaged,
isLemon, isSalvage,isTheftRecovered) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'''
my_cursor.executemany(insert_command, values)
mydb.commit()
print(my_cursor.rowcount, "Record Inserted")
mydb.close()
The problem seems to be that the list of car models has fewer than 20 entries.
for item in range(20):
car_list.append(car_model[item].text)
This always tries to append exactly 20 items to the car list. If there are fewer than 20 entries, you get an error, because an index such as car_model[10].text does not exist when there are only 10 entries. You can try:
for item in range(len(car_model)):
car_list.append(car_model[item].text)
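If you specifically want at most the first 20 cars, a hedged variant (assuming the same car_model, price and distance ResultSets from the find_all calls above) is to zip the three lists and slice, so the loop never runs past the shortest one:
# zip stops at the shortest ResultSet; the slice caps the output at 20 rows.
values = [(m.text.strip(), p.text.strip(), d.text.strip())
          for m, p, d in zip(car_model, price, distance)][:20]
The resulting list of (model, price, distance) tuples can be passed straight to cursor.executemany(), as in the first answer above.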
Although I am getting over 10 items as results in Python, right now I am only able to get the last product to appear in my MySQL database (with an id of 12 along with its information like price, picture, etc). I need to fix it so that they all appear and not just one product.
Python code is below.
import requests
from bs4 import BeautifulSoup
import mysql.connector
url = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphics%20card'
source = requests.get(url).text
soup = BeautifulSoup(source, 'lxml')
conn = mysql.connector.connect(host='127.0.0.1', user='x', database='scrape',password="x")
cursor = conn.cursor()
item_container = soup.find_all('div', class_='item-container')
def get_data():
lists = []
for index, item_name in enumerate(item_container):
name = item_name.find_all('a', class_='item-title')[0].text
lists.append({'name': name})
lists[index]['index'] = index
for index, item_price in enumerate(item_container):
price = item_price.find('li', class_='price-current').find('strong')
if price == None:
price == ('Not Available')
lists[index]['price'] = price
else:
price = ('$' + price.text +'.99')
prices = []
lists[index]['price'] = price
for index, item_picture in enumerate(item_container):
picture = 'http:' + item_picture.find('img', class_='lazy-img')['data-src']
lists[index]['picture'] = picture
for index, item_shipping in enumerate(item_container):
shipping = (item_shipping.find('li', class_='price-ship').text).strip()
lists[index]['shipping'] = shipping
def create_table():
val_index = lists[index]['index']
val_name = lists[index]['name']
val_picture = lists[index]['picture']
val_price = lists[index]['price']
val_shipping = lists[index]['shipping']
add_item = ("INSERT INTO newegg "
"(id, itemname, itempic, itemprice, itemshipping) "
"VALUES (%s, %s, %s, %s, %s)")
data_item = (val_index, val_name, val_picture, val_price, val_shipping)
cursor.execute("DELETE FROM newegg ")
conn.commit()
cursor.execute(add_item, data_item)
conn.commit()
cursor.close()
conn.close()
create_table();
get_data()
So the main thing that needs fixing is create_table(). We don't want it deleting the table contents right before inserting each item. Also, we need to loop over all of the items in your lists. I would do it this way:
def create_table():
cursor.execute("DELETE FROM newegg ")
conn.commit()
for product in lists:
val_index = product['index']
val_name = product['name']
val_picture = product['picture']
val_price = product['price']
val_shipping = product['shipping']
add_item = ("INSERT INTO newegg "
"(id, itemname, itempic, itemprice, itemshipping) "
"VALUES (%s, %s, %s, %s, %s)")
data_item = (val_index, val_name, val_picture, val_price, val_shipping)
cursor.execute(add_item, data_item)
conn.commit()
Notice that create_table() also no longer closes the connection for you. I would recommend closing the connection in the same scope where you initialized it (in this case, the global scope). The create_table() function doesn't "own" the connection resource, so it should not be allowed to destroy it. That said, it would also make perfect sense to both initialize and destroy the connection inside the function.
Also, note that this will clear out your table every time you do the scraping. That might be fine, but if you want your ids to change over time, don't delete at the beginning, and make your id column auto-increment or something similar.
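If you go the auto-increment route, a possible sketch (the exact DDL is an assumption about your schema, with column names mirroring the INSERT above):
# Sketch: let MySQL assign ids instead of reusing the scrape index.
cursor.execute("""
    CREATE TABLE IF NOT EXISTS newegg (
        id INT AUTO_INCREMENT PRIMARY KEY,
        itemname VARCHAR(255),
        itempic VARCHAR(512),
        itemprice VARCHAR(32),
        itemshipping VARCHAR(64)
    )
""")
# The INSERT then omits id and lets the database fill it in:
add_item = ("INSERT INTO newegg (itemname, itempic, itemprice, itemshipping) "
            "VALUES (%s, %s, %s, %s)")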
I'm new to Python (I learnt how to code with it two days ago). I'm trying to get feeds from a MySQL database and insert them into another table, but nothing gets inserted.
Here is my code:
cnx = MySQLConnection(**db_config)
if cnx.is_connected():
print("Database connected successfully...")
cursor = cnx.cursor(dictionary=True)
cursor.execute("SELECT * from external_feeds WHERE discipline = 'ALL' AND actif = 1")
rows = cursor.fetchall()
insert_feed = ("INSERT INTO feeds "
"(categorie, urlflux, titreflux, photonews, textnews, date, titrenews, liensnews, slug, photo)"
"VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
for row in rows:
feed = feedparser.parse(row["url"])
feed_link = row["url"]
name = row["name"]
image = row["photo"]
category = row["discipline"]
x = len(feed.entries)
for i in range(x):
feed_title = feed.entries[i].title
print feed_title
feed_url = feed.entries[i].link
print feed_url
feed_published = feed.entries[i].published
dPubPretty = strftime(feed_published, gmtime())
feed_description = feed.entries[i].description
slug = re.sub('[^a-zA-Z0-9 \n\-]', '', feed_url)
slug = slug.replace('httpwww', '')
slug = slug.replace('http', '')
# print insert_feed
data_feed = (category, feed_link, name, None, feed_description, dPubPretty, feed_title, feed_url, slug, image)
try:
cursor.execute(insert_feed, data_feed)
cursor.commit()
except:
cnx.rollback()
cursor.close()
Is there anyone who can help me figure out where the problem is? I am completely new to this, so I'm totally lost.
I see that you are calling cursor.commit() after inserting the data, which is incorrect; try using cnx.commit() instead.
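A minimal sketch of the corrected loop body, assuming the same cnx, cursor, insert_feed and data_feed names as above (committing on the connection object, not the cursor):
try:
    cursor.execute(insert_feed, data_feed)
    cnx.commit()  # commit belongs to the connection, not the cursor
except Exception as e:
    print("Insert failed:", e)  # surface the error instead of silently rolling back
    cnx.rollback()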