I am scraping a website and inserting data into a MySQL DB at the same time, something like this (I had to delete the scraping portions or the code would be too long):
def get_page(links):
    parent_window = driver.current_window_handle
    for link in links:
        driver.execute_script('window.open(arguments[0]);', link)
        all_windows = driver.window_handles
        child_window = [window for window in all_windows if window != parent_window][0]
        driver.switch_to.window(child_window)
        #scraping
        try:
            cursor.execute("INSERT INTO Investors(name, tags, website, introduction) VALUES(%s,%s,%s,%s)", (name, tag, website, introduction,))
        except Exception as e:
            raise e
        parent_window1 = driver.current_window_handle
        for lin in team_div:
            driver.execute_script('window.open(arguments[0]);', lin)
            all_windows = driver.window_handles
            child_window1 = [window for window in all_windows if window != parent_window1][1]
            driver.switch_to.window(child_window1)
            time.sleep(2)
            #scraping
            driver.close()
            driver.switch_to.window(parent_window1)
            sql = cursor.execute(f"SELECT inv_id FROM Investors WHERE name =\'{name}\'")
            pid = cursor.fetchone()
            try:
                cursor.execute("INSERT INTO team_members(inv_id, mem_name, picture, experience) VALUES(%s,%s,%s,%s)", (pid, port_name, headshot, work_ex,))
            except:
                pass
        driver.refresh()
        time.sleep(3)
        driver.execute_script("window.scrollBy(0,2825)", "")
        time.sleep(2)
        #scraping
        try:
            cursor.execute("INSERT INTO portfolio(inv_id, port_name, port_icon, port_desc) VALUES(%s,%s,%s,%s)", (pid1, p_name, p_icon, p_short_des,))
        except:
            pass
        driver.close()
        driver.switch_to.window(parent_window)

def get_links(page):
    if page == 1:
        url = 'https://www.cypherhunter.com/en/search/?q=investments'
        driver.get(url)
        time.sleep(2)
        links = driver.find_elements_by_xpath('//div[@class="app-item-container"]//a')
        return links
    else:
        url = f'https://www.cypherhunter.com/en/search/page/{page}/?q=investments'
        driver.get(url)
        time.sleep(2)
        links = driver.find_elements_by_xpath('//div[@class="app-item-container"]//a')
        return links

for p in range(1, 48):
    z = get_links(p)
    get_page(z)
I have a sense that maybe this is an inefficient way of sending data, but that would make this two questions. My question is: how can I make it so that, if the script fails for some reason, it starts from the same place on the next run? The last index can be read from MySQL, but how do I use it in the code? Only manually?
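One possible way to automate it, as a sketch rather than a tested fix: on startup, count the Investors rows that are already in MySQL, and skip that many links before scraping resumes. This assumes exactly one Investors row per scraped link and a fixed number of result links per listing page (LINKS_PER_PAGE below is a guess, not taken from the site).

LINKS_PER_PAGE = 20  # hypothetical page size; adjust to the real listing

def get_resume_offset(cursor):
    # assumes one Investors row was inserted per successfully scraped link
    cursor.execute("SELECT COUNT(*) FROM Investors")
    return cursor.fetchone()[0]

offset = get_resume_offset(cursor)
start_page = offset // LINKS_PER_PAGE + 1
skip_on_first_page = offset % LINKS_PER_PAGE

for p in range(start_page, 48):
    links = get_links(p)
    if p == start_page:
        links = links[skip_on_first_page:]  # skip links already stored
    get_page(links)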
I was able to scrape the first page of eBay sold items, so I attempted pagination; here's what I have:
ebay_url = 'https://www.ebay.com/sch/i.html?_from=R40&_nkw=oakley+sunglasses&_sacat=0&Brand=Oakley&rt=nc&LH_Sold=1&LH_Complete=1&_ipg=200&_oaa=1&_fsrp=1&_dcat=79720'

# Load in html
html = requests.get(ebay_url)
# print(html.text)

driver = wd.Chrome(executable_path=r'/Users/mburley/Downloads/chromedriver')
driver.maximize_window()  # Maximizes window
driver.implicitly_wait(30)  # Gives an implicit wait for 30 seconds
driver.get(ebay_url)
wait = WebDriverWait(driver, 20)  # Makes driver wait 20 seconds

sold_date = []
title = []
price = []
i = 1

## Loop here to get multiple pages
next_page = True
while next_page:
    try:
        # for item in all_items
        for item in driver.find_elements(By.XPATH, "//div[contains(@class,'title--tagblock')]/span[@class='POSITIVE']"):
            try:
                # Get Sale Date of item and update 'data'
                sold_date.append(item.text)
            except NoSuchElementException:
                # Element not found
                sold_date.append(None)
            try:
                # Get title of each item and update 'data'
                title.append(driver.find_element_by_xpath(f"(//div[contains(@class,'title--tagblock')]/span[@class='POSITIVE']/ancestor::div[contains(@class,'tag')]/following-sibling::a/h3)[{i}]").text)
            except NoSuchElementException:
                # Element not found
                title.append(None)
            try:
                # Get price of each item and update 'data'
                price.append(item.find_element_by_xpath(f"(//div[contains(@class,'title--tagblock')]/span[@class='POSITIVE']/ancestor::div[contains(@class,'tag')]/following-sibling::div[contains(@class,'details')]/descendant::span[@class='POSITIVE'])[{i}]").text)
            except NoSuchElementException:
                # Element not found
                price.append(None)
            i = i + 1
        # Print results of scraped data on page
        print(sold_date)
        print(title)
        print(price)
        data = {
            'Sold_date': sold_date,
            'title': title,
            'price': price
        }
        # Load Next Page by clicking button
        button = driver.find_element_by_name('pagination__next icon-link')
        button.click()
        print("Clicked on Next Page!")
        time.sleep(1)
    except:
        print("Done!")
        next_page = False

df = pd.DataFrame.from_dict(data)
df.to_csv('out_two.csv', index=0)
After I had the code to scrape page 1, I added:
... code ...

## Loop here to get multiple pages
next_page = True
while next_page:
    try:
        ... code to scrape page 1 ...

        # Load Next Page by clicking button
        button = driver.find_element_by_name('pagination__next icon-link')
        button.click()
        print("Clicked on Next Page!")
        time.sleep(1)
    except:
        print("Done!")
        next_page = False
Unfortunately, this scrapes the first page, then searches for the next page, can't find the "button", so it exits and prints "Done!". I don't know a lot about scraping, so I tried to follow an online example. Can anyone help? Thanks!
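A side note on the failing lookup, as a hedged sketch rather than a verified fix: 'pagination__next icon-link' looks like a class list rather than a name attribute, so find_element_by_name() will never match it. One common alternative is to locate the control by CSS class and wait for it to be clickable; the selector below is only an assumption about eBay's markup, not confirmed.

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException

try:
    # assumed selector; inspect the page to confirm the next-page anchor's class
    button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "a.pagination__next"))
    )
    button.click()
except TimeoutException:
    next_page = False  # no next-page control found; stop paginating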
I am trying to scrape YouTube comments so that each row contains the title of the video, the author of the comment, and the comment itself. As seen in the code below, I open the driver successfully, get rid of some authentication and cookie messages, and scroll enough to get the first comments loaded. After this happens, I still am not able to get the comment text by XPath, as seen below.
csv_file = open('funda_youtube_comments.csv', 'w', encoding="UTF-8", newline="")
writer = csv.writer(csv_file)
writer.writerow(['title', 'comment', 'author'])

PATH = r"C:\Users\veiza\OneDrive\Desktop\AUAS\University\Quarter 2\Online Data Mining\Project1test\chromedriver.exe"
driver = webdriver.Chrome(PATH)
driver.implicitly_wait(10)
driver.get("https://www.youtube.com/watch?v=VWQaP9txG6M&t=76s")
driver.maximize_window()
time.sleep(2)

driver.execute_script('window.scrollTo(0,700);')
wait = WebDriverWait(driver, 20)
wait.until(EC.presence_of_element_located((By.XPATH, "//div[@id='dismiss-button']"))).click()
time.sleep(2)

WebDriverWait(driver, 10).until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "iframe[src^='https://consent.google.com']")))
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[@id='introAgreeButton']"))).click()
time.sleep(2)

title = driver.title
print(title)
time.sleep(5)

totalcomments = len(driver.find_elements_by_xpath("""//*[@id="content-text"]"""))

if totalcomments < 50:
    index = totalcomments
else:
    index = 50

youtube_dict = {}

ccount = 0
while ccount < index:
    try:
        comment = driver.find_elements_by_xpath('//*[@id="content-text"]')[ccount].text
    except:
        comment = ""
    try:
        authors = driver.find_elements_by_xpath('//a[@id="author-text"]/span')[ccount].text
    except:
        authors = ""
    try:
        title = title
    except:
        title = ""
    youtube_dict['comment'] = comment
    youtube_dict['author'] = authors
    youtube_dict['video title'] = title
    writer.writerow(youtube_dict.values())
    ccount = ccount + 1
    print(youtube_dict)

driver.close()
What am I doing wrong?
If you want to make it simple, you can use tube_dl
pip install tube_dl
This module has a Comments class that can help you process comments.
Here's a simple usage of it:
from tube_dl.comments import Comments
comments = Comments('yt url').process_comments()
#If you want limited comments, you can specify that. Ex : process_comments(count=45)
Feel free to raise issues at github.com/shekharchander/tube_dl; I'll be happy to resolve them.
I was able to scrape the YouTube comments. Below you can see the solution.
options = Options()
options.add_argument("--headless")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)

PATH = r"C:\Users\veiza\OneDrive\Desktop\AUAS\University\Quarter 2\Online Data " \
       r"Mining\Project1test\chromedriver.exe "
driver = webdriver.Chrome(executable_path=PATH, options=options)

driver.get(response.url)
time.sleep(5)

try:
    title = driver.find_element_by_xpath('//*[@id="container"]/h1/yt-formatted-string').text
    comment_section = driver.find_element_by_xpath('//*[@id="comments"]')
except exceptions.NoSuchElementException:
    error = "Error: Double check selector OR "
    error += "element may not yet be on the screen at the time of the find operation"
    print(error)

driver.execute_script("arguments[0].scrollIntoView();", comment_section)
time.sleep(7)

last_height = driver.execute_script("return document.documentElement.scrollHeight")

while True:
    driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
    time.sleep(2)
    new_height = driver.execute_script("return document.documentElement.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")

try:
    accounts_elems = driver.find_elements_by_xpath('//*[@id="author-text"]')
    comment_elems = driver.find_elements_by_xpath('//*[@id="content-text"]')
except exceptions.NoSuchElementException:
    error = "Error: Double check selector OR "
    error += "element may not yet be on the screen at the time of the find operation"
    print(error)

accounts = [elem.text for elem in accounts_elems]
comments = [elem.text for elem in comment_elems]

for comment_index in range(len(comment_elems)):
    yield {
        'title': title,
        'url': driver.current_url,
        'account': accounts[comment_index],
        'comment': comments[comment_index]
    }
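Note that this snippet is written as a Scrapy-style callback (it references response.url and ends with yield). If you are running it as a plain script, the yield at module level would be a syntax error; a hedged alternative is to reuse the csv writer set up in the question and write each row directly:

# Sketch: write the scraped rows with the writer/csv_file defined in the question.
for comment_index in range(len(comment_elems)):
    writer.writerow([title, comments[comment_index], accounts[comment_index]])
csv_file.close()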
The problem is that it doesn't like the posts.
I have tried different methods, such as locating the buttons by tag name.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

def like_photo(self):
    driver = self.driver
    driver.get("https://www.instagram.com")
    time.sleep(1)
    for i in range(1, 4):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        # find all the heart links
        hrefs = driver.find_elements_by_xpath("//span[@aria-label='Synes godt om']")
        pic_hrefs = [elem.get_attribute('href') for elem in hrefs]
        pic_hrefs = [href for href in pic_hrefs]
        print(' Photos ' + str(len(pic_hrefs)))
        for _ in pic_hrefs:
            driver.get("https://www.instagram.com")
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            try:
                like_button = lambda: driver.find_elements_by_xpath("//span[@aria-label='Synes godt om']")
                like_button.click()
                time.sleep(18)
            except Exception as e:
                time.sleep(1)

nameIG = InstagramBot(username, password)
nameIG.login()
nameIG.like_photo()
It doesn't like any post; the output is just:
Photos 4
Process finished with exit code 0
Exit code 0 means your code ran without errors. However, there's still a problem.
To see whether there are actual errors in your code, change the exception handling:
except Exception as e:
    print(e)  # shows actual error
Try this:
like_buttons = driver.find_elements_by_xpath(some_xpath_to_buttons)  # list of WebElements
for button in like_buttons:
    button.click()
    time.sleep(18)
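Put together with the asker's own Danish-locale selector, that might look like the sketch below (the XPath is taken from the question and may no longer match Instagram's current markup):

like_buttons = driver.find_elements_by_xpath("//span[@aria-label='Synes godt om']")  # asker's selector
for button in like_buttons:
    try:
        button.click()
        time.sleep(18)  # long pause between likes to avoid rate limiting
    except Exception as e:
        print(e)  # surface the real error instead of silently sleeping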
I am trying to automate this Instagram link. I need to keep scrolling and fetch all the links. I am trying the following, but it is not working.
def fetch_links_by_hashtag(hash_tag):
    url = 'https://www.instagram.com/explore/tags/marketing/'
    driver.get(url)
    driver.implicitly_wait(20)
    is_more = False
    try:
        elem_more = wait.until(EC.element_to_be_clickable((By.PARTIAL_LINK_TEXT, "Load more")))
        elem_more.click()
        is_more = True
    except Exception as ex:
        print(str(ex))
    pop = driver.find_element_by_tag_name('footer')
    # pop = driver.find_element_by_link_text('About us')
    # pop = driver.find_element_by_class_name('_4gt3b')
    if pop is not None:
        for i in range(10):
            print('Calling scrolling script')
            # It scrolls till the end
            driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', pop)
            sleep(4)
        html = pop.get_attribute('innerHTML')
        print(html)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
How to scroll down to the bottom of a page?
In addition to 宏杰李's answer:
driver.execute_script("return arguments[0].scrollIntoView();", element_obj)
Also, if you want to make an extra scroll:
driver.execute_script("return arguments[0].parentNode.scrollTop = "
"arguments[0].parentNode.scrollTop + {extra_scroll}"
.format(extra_scroll=extra_scroll_pixels), element_obj)
My entire code:
def _scroll_to_element(driver, element, extra_scroll=None):
    # Scroll to element
    driver.execute_script("return arguments[0].scrollIntoView();", element)
    # Scroll parentNode with the extra pixels (If provided)
    if extra_scroll:
        driver.execute_script(
            "return arguments[0].parentNode.scrollTop = "
            "arguments[0].parentNode.scrollTop + {extra_scroll}".format(
                extra_scroll=str(extra_scroll)), element)
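A possible usage sketch (the locator is a placeholder for illustration, not from the original post):

# Hypothetical usage: bring a comments container into view, then nudge 200px further.
comments_section = driver.find_element_by_id("comments")  # placeholder locator
_scroll_to_element(driver, comments_section, extra_scroll=200)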
I am trying to grab information from TripAdvisor. I sometimes get
Message: stale element reference: element is not attached to the page document
(Session info: chrome=47.0.2526.73)
(Driver info: chromedriver=2.20.353124 (035346203162d32c80f1dce587c8154a1efa0c3b),platform=Mac OS X 10.10.4 x86_64)
and then the element is just whatever I assigned it to. How can I fix my code to handle this issue and work out a solution, instead of re-running the code?
def getElements(driver):
    elements = []
    for dd in driver.find_elements_by_xpath("//*[contains(@class, 'ui_button original')]"):
        try:
            if dd.text == "Book Now":
                elements.append(dd)
        except Exception as ee:
            print ee
    return elements

def getBookingPartner(driver, ibInfo):
    data = []
    i = 0
    elements = []
    time.sleep(2)
    elements = getElements(driver)
    elementCounter = 0
    while(elements == [] or elementCounter > 5):
        elements = getElements(driver)
        elementCounter += 1
    print "Length of elements should be > 0 : " + str(len(elements))
    for ii in ibInfo:
        if ii[0] == "Yes":
            driver.implicitly_wait(3)
            bookingPartner = "Error"
            print ii
            driver.implicitly_wait(3)
            try:
                elements[i].click()
                driver.implicitly_wait(3)
                driver.switch_to_window(driver.window_handles[-1])
            except Exception as ee:
                try:
                    driver.refresh()
                    getElements(driver)[i].click()
                    time.sleep(1)
                    driver.switch_to_window(driver.window_handles[-1])
                except Exception as ee:
                    print "Stale Exception...."
                    print ee
            try:
                driver.implicitly_wait(3)
                driver.find_elements_by_xpath("//*[contains(@class, 'book_now')]")[1].click()
                driver.implicitly_wait(1)
                page = etree.HTML(driver.page_source)
                bookingPartner = page.xpath("//div[contains(@class, 'custServiceMsg')]//text()")[0].split("will")[0].strip()
            except:
                try:
                    time.sleep(3)
                    driver.find_elements_by_xpath("//*[contains(@class, 'book_now')]")[1].click()
                    time.sleep(2)
                    page = etree.HTML(driver.page_source)
                    bookingPartner = page.xpath("//div[contains(@class, 'custServiceMsg')]//text()")[0].split("will")[0].strip()
                except:
                    try:
                        bookingPartner = page.xpath("//div[contains(@class, 'custServiceMsg')]//text()")[1].split("will")[0].strip()
                    except Exception as ee:
                        bookingPartner = "Error"
                        print "error"
            i += 1
            if bookingPartner == "The remainder":
                bookingPartner = page.xpath("//div[contains(@class, 'custServiceMsg')]//text()")[1].split("will")[0].strip()
            if len(driver.window_handles) > 1:
                driver.close()
                driver.switch_to_window(driver.window_handles[0])
            print bookingPartner
            data.append([ii[0], ii[1], bookingPartner])
        else:
            data.append([ii[0], ii[1], "N/A"])
            ii.extend(["N/A"])
    print data
    return data
A Stale Element Reference Exception occurs when an element:
Has been deleted
Is no longer attached to the DOM (as in your case)
Has changed
From the docs:
You should discard the current reference you hold and replace it, possibly by locating the element again once it is attached to the DOM.
i.e.: "Find" the element again.
You'll need to modify the code to catch this error for the appropriate step.
from selenium.common.exceptions import StaleElementReferenceException

elem = driver.find_element_by_xpath('something leaves dom')
# ... do other actions which change the page and then later...
try:
    elem.click()
except StaleElementReferenceException:
    elem = driver.find_element_by_xpath('something leaves dom')
    elem.click()
Make a re-usable version if you need it extensively for several elements; a sketch of one is shown below.
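A minimal sketch of such a helper (the function name, locator tuple, and retry count are illustrative assumptions, not part of the original answer):

from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By

def click_with_retry(driver, locator, attempts=3):
    """Re-find and click the element, retrying if the reference goes stale."""
    for _ in range(attempts):
        try:
            driver.find_element(*locator).click()
            return True
        except StaleElementReferenceException:
            continue  # the DOM changed; look the element up again
    return False

# Hypothetical usage:
# click_with_retry(driver, (By.XPATH, "//*[contains(@class, 'book_now')]"))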
Btw, you should not be catching Exception in your code. Be specific about which ones you want to handle.