I use the following code to enter a code in a search bar, click a button, and finally extract some information:
from selenium import webdriver
import time
from fake_useragent import UserAgent
url = 'https://www.ufficiocamerale.it/'
vat = '06655971007'
useragent = UserAgent()
profile = webdriver.FirefoxProfile()
profile.set_preference("general.useragent.override", useragent.random)
driver = webdriver.Firefox(profile)
driver.get(url)
time.sleep(5)
item = driver.find_element_by_xpath('//form[@id="formRicercaAzienda"]//input[@id="search_input"]')
item.send_keys(vat)
time.sleep(1)
button = driver.find_element_by_xpath('//form[@id="formRicercaAzienda"]//p//button[@type="submit"]')
button.click()
time.sleep(5)
all_items = driver.find_elements_by_xpath('//ul[@id="first-group"]/li')
for item in all_items:
    if '@' in item.text:
        print(item.text.split(' ')[1])
driver.close()
Now I would like to modify the code to run the above process several times using a for loop, i.e. something like this:
from selenium import webdriver
import time
from fake_useragent import UserAgent
url = 'https://www.ufficiocamerale.it/'
vats = ['06655971007', '06655971007', '01010101010']
for vat in vats:
    useragent = UserAgent()
    # rest of the code
but it does nothing. Where am I going wrong? Is it the definition of the user agent?
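For reference, a minimal sketch of one way the loop could be structured, creating the driver once outside the loop and releasing it in a finally block. This is an illustration using the Selenium 3 style API from the question, not the poster's exact fix:

from selenium import webdriver
import time
from fake_useragent import UserAgent

url = 'https://www.ufficiocamerale.it/'
vats = ['06655971007', '06655971007', '01010101010']

profile = webdriver.FirefoxProfile()
profile.set_preference("general.useragent.override", UserAgent().random)
driver = webdriver.Firefox(profile)  # created once, reused for every VAT
try:
    for vat in vats:
        driver.get(url)
        time.sleep(5)
        item = driver.find_element_by_xpath('//form[@id="formRicercaAzienda"]//input[@id="search_input"]')
        item.send_keys(vat)
        time.sleep(1)
        driver.find_element_by_xpath('//form[@id="formRicercaAzienda"]//p//button[@type="submit"]').click()
        time.sleep(5)
        for li in driver.find_elements_by_xpath('//ul[@id="first-group"]/li'):
            if '@' in li.text:
                print(vat, '=', li.text.split(' ')[1])
finally:
    driver.quit()  # release the browser even if a lookup fails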
Can you be more specific about "it does nothing", please?
Is the code without the loop working fine?
*When testing it on this web site with "06655971007" as input, it won't print anything because there is no '@' in the returned string.
EDIT
from selenium import webdriver
import time
#from fake_useragent import UserAgent
url = 'https://www.ufficiocamerale.it/'
vats = ['06655971007', '06655971007', '01010101010']
for vat in vats:
    #useragent = UserAgent()
    profile = webdriver.FirefoxProfile()
    profile.set_preference("general.useragent.override", "useragent.random")
    driver = webdriver.Chrome('./chromedriver.exe')
    #driver = webdriver.Firefox(profile)
    driver.get(url)
    time.sleep(5)
    item = driver.find_element_by_xpath('//form[@id="formRicercaAzienda"]//input[@id="search_input"]')
    item.send_keys(vat)
    time.sleep(1)
    button = driver.find_element_by_xpath('//form[@id="formRicercaAzienda"]//p//button[@type="submit"]')
    button.click()
    time.sleep(5)
    all_items = driver.find_elements_by_xpath('//ul[@id="first-group"]/li')
    found_it = False
    for item in all_items:
        if '@' in item.text:
            print(vat + " = " + item.text.split(' ')[1])
            found_it = True
    if not found_it:
        print(vat + " no email found")
    driver.close()
With output like this:
01010101010 no email found
08157270961 = vince.srl@legalmail.it
06655971007 = enelenergia@pec.enel.it
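A side note for newer setups: the find_element_by_* helpers used above were removed in Selenium 4. A minimal sketch of the same lookup with the current By API, assuming Selenium 4.6+ so Selenium Manager resolves chromedriver automatically:

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome()  # Selenium 4.6+: no explicit chromedriver path needed
driver.get('https://www.ufficiocamerale.it/')
time.sleep(5)
driver.find_element(By.XPATH, '//input[@id="search_input"]').send_keys('06655971007')
driver.find_element(By.XPATH, '//form[@id="formRicercaAzienda"]//p//button[@type="submit"]').click()
time.sleep(5)
for li in driver.find_elements(By.XPATH, '//ul[@id="first-group"]/li'):
    if '@' in li.text:
        print(li.text.split(' ')[1])
driver.quit()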
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import chromedriver_autoinstaller
from selenium.webdriver.common.keys import Keys
import subprocess
import time
from selenium.webdriver.common.by import By
import functionmodules
### PATH
CHROME_DRIVER = 'C:\\Users\\SANGHYUN\\Downloads\\chromedriver_win32\\chromedriver.exe'
url = 'https://cafe.naver.com/reply14/1'
#url = 'https://cafe.naver.com/reply14'
CHROME_PATH = 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe'
searchpath = url
subprocess.Popen(r'C:\Program Files\Google\Chrome\Application\chrome.exe --remote-debugging-port=9222 --user-data-dir="C:\chrometemp"')
option = Options()
option.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
chrome_ver = chromedriver_autoinstaller.get_chrome_version().split('.')[0]
try:
    driver = webdriver.Chrome(f'./{chrome_ver}/chromedriver.exe', options=option)
except:
    chromedriver_autoinstaller.install(True)
    driver = webdriver.Chrome(f'./{chrome_ver}/chromedriver.exe', options=option)
driver.get(searchpath)

def CallGoToArticleStep():
    # go to main
    driver.switch_to.parent_frame()
    driver.find_element(By.XPATH, '//*[@id="menuLink1"]').click()
    driver.switch_to.frame('cafe_main')
    # click article3
    time.sleep(2)
    firstarticle = '//*[@id="main-area"]/div[4]/table/tbody/tr[2]/td[1]/div[3]/div/a[1]'
    element3 = driver.find_element(By.XPATH, firstarticle)
    element3.send_keys('\n')

#CallGoToArticleStep()

# write reply, send reply
for i in range(1):
    time.sleep(4)
    print(i)
    replyString = '//*[@id="app"]/div/div/div[2]/div[2]/div[4]/div[2]/div[1]/textarea'
    replyElement = driver.find_element(By.XPATH, replyString)
    replyElement.send_keys('whisky life')
    replyClickString = '//*[@id="app"]/div/div/div[2]/div[2]/div[4]/div[2]/div[2]/div[2]/a'
    replyClickElement = driver.find_element(By.XPATH, replyClickString)
    replyClickElement.click()
time.sleep(1000)
In this script, if I call CallGoToArticleStep() I can get replyElement; if I don't call it, I can't get replyElement, even though the browser shows the same page.
Is there a way to get replyElement without calling the CallGoToArticleStep function?
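A hedged guess, since CallGoToArticleStep switches into the cafe_main frame: the reply textarea probably lives inside that iframe, so without the call the element is searched in the wrong document. A minimal sketch of switching frames directly, reusing replyString from the code above (whether the reply page uses the same frame name is an assumption):

# hypothetical: switch into the frame before locating the reply box
driver.switch_to.default_content()   # start from the top-level document
driver.switch_to.frame('cafe_main')  # frame name as used in CallGoToArticleStep
replyElement = driver.find_element(By.XPATH, replyString)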
I'm working with Python and Selenium. I'm typing a keyword, which is then searched on Google. In the results section, I am trying to open the URLs one by one and store the data of the p tags.
But my script is storing the data of only one site. Can anyone help me store the p-tag data of all the opened sites?
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
"""
Taking input from user
"""
search_input = input("Input the keyword you want to search for:")
search_input = search_input.replace(' ', '+')
driver = webdriver.Chrome(executable_path="E:\\chromedriver\\chromedriver.exe")
for i in range(1):
    matched_elements = driver.get("https://www.google.com/search?q=" +
                                  search_input + "&start=" + str(i))
    print(driver.title)
    driver.maximize_window()
    time.sleep(5)
    links_url = driver.find_elements_by_xpath("//div[@class='yuRUbf']/a[@href]")
    links = []
    for x in links_url:
        links.append(x.get_attribute('href'))
    link_data = []
    for new_url in links:
        print('new url : ', new_url)
        driver.get(new_url)
        link_data.append(driver.page_source)
        """
        Getting the data from the site
        """
        content = driver.find_elements(By.TAG_NAME, "p")
        for data in content:
            print(data.text)
        driver.back()
driver.close()
Here is the edited answer; I misunderstood your question at first:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
"""
Taking input from user
"""
search_input = input("Input the keyword you want to search for:")
search_input = search_input.replace(' ', '+')
driver = webdriver.Chrome(executable_path="E:\\chromedriver\\chromedriver.exe")
for i in range(1):
    matched_elements = driver.get("https://www.google.com/search?q=" +
                                  search_input + "&start=" + str(i))
    print(driver.title)
    driver.maximize_window()
    time.sleep(5)
    links_url = driver.find_elements_by_xpath("//div[@class='yuRUbf']/a[@href]")
    links = []
    for x in links_url:
        links.append(x.get_attribute('href'))
    link_data = []
    for new_url in links:
        print('\nnew url : ', new_url)
        driver.get(new_url)
        # Getting the data from the site
        try:
            link = driver.find_elements(By.TAG_NAME, "p")
            for p in link:
                print(p.get_attribute("innerText"))
        except:
            continue
driver.quit()
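A possible refinement, not part of the original answer: replacing the fixed time.sleep(5) with an explicit wait, so each results page is scraped as soon as the links are present (same selector as above):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 10)
# wait until at least one result link is present instead of sleeping blindly
wait.until(EC.presence_of_all_elements_located(
    (By.XPATH, "//div[@class='yuRUbf']/a[@href]")))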
My current code is:
import time
import numpy as np
import requests
from bs4 import BeautifulSoup as soup
from selenium import webdriver
first_page_url = 'https://store.steampowered.com/tags/en/Action/#p=0&tab=NewReleases'
first_url = 'https://store.steampowered.com/tags/en/Action/#p='
rest_url = '&tab=NewReleases'
driver = webdriver.Chrome()
driver.get(first_page_url)
soup_page = driver.page_source
print(type(soup_page))
soup_page = soup(driver.page_source,'lxml')
page_numbers = soup_page.find('div',{'id':'NewReleases_ctn'})
page_numbers = page_numbers.text.split()
#print(page_numbers)
last_page_number = page_numbers[13]
print(last_page_number)
last_page_number = last_page_number.replace(',','')
last_page_number = int(last_page_number)
last_page_number = last_page_number / 15
print(last_page_number)
last_page_number = round(last_page_number)
pages = range(0,last_page_number)
index_number = 0
nana = 1
for page in pages:
    page = first_url + str(pages[index_number]) + rest_url
    print(type(page))
    driver = webdriver.Chrome()
    driver.get(page)
    time.sleep(5)
    soup_page = driver.page_source
    #print(type(soup_page))
    soup_page = soup(driver.page_source,'lxml')
    new_releases = soup_page.find('div',{'id':'NewReleasesRows'})
    containers = new_releases.find_all('a',{'class':'tab_item'})
    container = containers[0]
    driver.close()
    for container in containers:
        title = container.find('div',{'class':'tab_item_name'}).text
        print(nana, title)
        nana = nana + 1
    index_number = index_number + 1
The code works as intended, except that every time it goes through the first for loop it opens a new driver.Chrome().
Is there a way to reuse the opened browser again and again, rather than opening a new one on every iteration?
The driver object is created globally, and you are invoking Chrome again inside the loop. Remove that line (commented out below) and all pages will be loaded in the same Chrome instance.
for page in pages:
    page = first_url + str(pages[index_number]) + rest_url
    print(type(page))
    # driver = webdriver.Chrome()
    driver.get(page)
This would loop through all 71 pages instead, without needing to grab any page number:
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get(first_page_url)
while True:
    try:
        wait.until(EC.element_to_be_clickable((By.XPATH, "//span[not(@class='pagebtn disabled') and @id='NewReleases_btn_next']"))).click()
    except:
        break
Imports:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
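Putting the two suggestions together, a hedged sketch that reuses one browser, scrapes the titles on each page, and clicks through with the next button. Selectors are reused from the question and answer above, and the fixed sleep is only one way to let each page render:

import time
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

first_page_url = 'https://store.steampowered.com/tags/en/Action/#p=0&tab=NewReleases'
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get(first_page_url)
count = 1
while True:
    time.sleep(5)  # let the rows render; an explicit wait on NewReleasesRows would also work
    page_soup = soup(driver.page_source, 'lxml')
    new_releases = page_soup.find('div', {'id': 'NewReleasesRows'})
    for container in new_releases.find_all('a', {'class': 'tab_item'}):
        print(count, container.find('div', {'class': 'tab_item_name'}).text)
        count += 1
    try:
        wait.until(EC.element_to_be_clickable((By.XPATH,
            "//span[not(@class='pagebtn disabled') and @id='NewReleases_btn_next']"))).click()
    except Exception:
        break  # next button disabled or missing: last page reached
driver.quit()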
This is the form link: https://docs.google.com/forms/d/e/1FAIpQLSe61r6TNx4JvRg2gVu3Eu8-KYKCvd1dJCAmYJFnNw4EU9llMw/viewform and this is my code. I want to select DHAOUI MOHAMED AZIZ; what am I missing?
import time
from selenium import webdriver
def sleep():
    time.sleep(5)

Chrome = webdriver.Chrome(executable_path='C:/Users/dhaou/Desktop/chromedriver.exe')
url = "https://docs.google.com/forms/d/e/1FAIpQLSe61r6TNx4JvRg2gVu3Eu8-KYKCvd1dJCAmYJFnNw4EU9llMw/viewform"
Chrome.get(url)
first_div = Chrome.find_element_by_xpath('//*[@id="mG61Hd"]/div[2]/div/div[2]/div[1]/div/div/div[2]/div')
first_div.click()
sleep()
second = Chrome.find_element_by_xpath('//*[@id="mG61Hd"]/div[2]/div/div[2]/div[1]/div/div/div[2]/div/div[1]/div[1]/div[16]')
second.click()
This seems to work for me for the URL you provided.
PATH = "./chromedriver"
driver = webdriver.Chrome(PATH)
driver.implicitly_wait(5)
url = "https://docs.google.com/forms/d/e/1FAIpQLSe61r6TNx4JvRg2gVu3Eu8-KYKCvd1dJCAmYJFnNw4EU9llMw/viewform"
driver.get(url)
element = ".quantumWizMenuPaperselectOption.appsMaterialWizMenuPaperselectOption.freebirdThemedSelectOptionDarkerDisabled.exportOption.isSelected.isPlaceholder"
dropdown = driver.find_element_by_css_selector(element)
dropdown.click()
name = "ALI GHANMI"
list_element = "//div[@class='exportSelectPopup quantumWizMenuPaperselectPopup appsMaterialWizMenuPaperselectPopup']//span[text()='"+name+"']"
dropdown_element = driver.find_element_by_xpath(list_element)
dropdown_element.click()
By the way, the form at the URL you provided does not have DHAOUI MOHAMED AZIZ, so I tried a different name.
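One more hedged suggestion, beyond the original answer: Google Forms renders the option list only after the dropdown is clicked, so an explicit wait on the option can be more robust than implicitly_wait alone (reusing driver and list_element from the snippet above):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 10)
# wait for the popup option to become clickable before selecting it
option = wait.until(EC.element_to_be_clickable((By.XPATH, list_element)))
option.click()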
This code crawls again from the beginning every time an error occurs. I want to change it so that it crawls only new text instead of restarting from the beginning.
I would also like to ask for further advice.
from selenium import webdriver
from time import sleep
from bs4 import BeautifulSoup, Comment
import pandas as pd
#Setting up Chrome webdriver Options
#chrome_options = webdriver.ChromeOptions()
#setting up local path of chrome binary file
#chrome_options.binary_location = "/Users/Norefly/chromedriver2/chromedriver.exec"
#creating Chrome webdriver instance with the set chrome_options
driver = webdriver.PhantomJS("C:/Python/phantomjs-2.1.1-windows/bin/phantomjs.exe")
link = "https://play.google.com/store/apps/details?id=com.supercell.clashofclans&hl=en"
driver.get(link)
#driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
Ptitle = driver.find_element_by_class_name('id-app-title').text.replace(' ','')
print(Ptitle)
#driver.find_element_by_xpath('//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]').click()
sleep(1)
driver.find_element_by_xpath('//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]/div/div').click()
#select_newest.select_by_visible_text('Newest')
#driver.find_element_by_xpath('//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]/div/div').click()
sleep(2)
#driver.find_element_by_css_selector('.review-filter.id-review-sort-filter.dropdown-menu-container').click()
driver.find_element_by_css_selector('.displayed-child').click()
#driver.find_element_by_xpath("//button[@data-dropdown-value='1']").click()
driver.execute_script("document.querySelectorAll('button.dropdown-child')[0].click()")
reviews_df = []
for i in range(1, 10):
    try:
        for elem in driver.find_elements_by_class_name('single-review'):
            print(str(i))
            content = elem.get_attribute('outerHTML')
            soup = BeautifulSoup(content, "html.parser")
            #print(soup.prettify())
            date = soup.find('span', class_='review-date').get_text()
            rating = soup.find('div', class_='tiny-star')['aria-label'][6:7]
            title = soup.find('span', class_='review-title').get_text()
            txt = soup.find('div', class_='review-body').get_text().replace('Full Review', '')[len(title)+1:]
            print(soup.get_text())
            temp = pd.DataFrame({'Date': date, 'Rating': rating, 'Review Title': title, 'Review Text': txt}, index=[0])
            print('-'*10)
            reviews_df.append(temp)
            #print(elem)
    except:
        print('what i can do?')
        driver.find_element_by_xpath('//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]/div/div').click()
        #driver.execute_script("document.querySelectorAll('button.dropdown-child')[0].click()")
        #driver.find_element_by_xpath('//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]/div/div').click()
reviews_df = pd.concat(reviews_df, ignore_index=True)
reviews_df.to_csv(Ptitle+'review_google.csv', encoding='utf-8')
driver.close()
And I wonder whether this is a problem with PhantomJS.
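It may well be: PhantomJS is unmaintained, and Selenium deprecated and later removed its PhantomJS support, so headless Chrome is the usual replacement. A minimal sketch under that assumption (the chromedriver path is a placeholder):

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')               # no visible window, similar to PhantomJS
options.add_argument('--window-size=1920,1080')
# placeholder path; adjust to wherever chromedriver lives
driver = webdriver.Chrome('C:/Python/chromedriver/chromedriver.exe', options=options)
driver.get('https://play.google.com/store/apps/details?id=com.supercell.clashofclans&hl=en')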