Characters not decoded properly using Jsoup and PhantomJS - python

Here is the thing: I'm using PhantomJS and Selenium in Python to render pages. This is the code:
import sys, time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

path_to_chromedriver = 'C:\\..\\chromedriver'

section = sys.argv[1]
path = sys.argv[2]
links = sys.argv[3]

listOfLinks = []
file = open(links, 'r')
for link in file:
    listOfLinks.append(link)

dr = webdriver.Chrome(executable_path = path_to_chromedriver)

cont = 0
for link in listOfLinks:
    try:
        dr.get(link)
        # Wait.
        element = WebDriverWait(dr, 20).until(
            EC.presence_of_element_located((By.CLASS_NAME, "_img-zoom"))
        )
        time.sleep(1)
        htmlPath = path + section + "_" + str(cont) + ".html"
        # Write HTML.
        file = open(htmlPath, 'w')
        file.write(dr.page_source)
        file.close()
        cont = cont + 1
    except:
        print("Exception")

dr.quit()
This code creates an HTML file for each of the links received as a parameter.
Each file is then parsed by Jsoup in Java:
Document document = Jsoup.parse( file, "UTF-8" );
However, special characters such as '€', 'á', 'é', 'í', etc. are not decoded properly and are being replaced by '?'. How can I solve this?

Solution found by Uzochi
Try Document document = Jsoup.parse( file, "ISO-8859-1" );
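If you want to keep "UTF-8" on the Jsoup side instead, another option (assuming the mismatch comes from the Python side, since open(htmlPath, 'w') uses the platform's default encoding on Windows) is to write the file with an explicit encoding. A minimal sketch:

import io

# Write the page source with an explicit encoding so the bytes on disk
# match the charset Jsoup is told to expect.
with io.open(htmlPath, 'w', encoding='utf-8') as f:
    f.write(dr.page_source)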

Related

Append data wrong in csv file

from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
import requests
from csv import writer

options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 20)

URL = 'https://mergr.com/firms/search/employees?page=1&firm%5BactiveInvestor%5D=2&sortColumn=employee_weight&sortDirection=asc'
driver.get(URL)

email = driver.find_element(By.CSS_SELECTOR, "input#username")
email.send_keys("timgr8@outlook.com")
password = driver.find_element(By.CSS_SELECTOR, "input#password")
password.send_keys("Cosmos1990$$$$$$$")
login = driver.find_element(By.CSS_SELECTOR, "button.btn").click()

urls = []
product = []
soup = BeautifulSoup(driver.page_source, "lxml")
details = soup.select("tbody tr")
for detail in details:
    try:
        t1 = detail.select_one("h5.profile-title a").text
    except:
        pass
    wev = {
        'Name': t1,
    }
    product.append(wev)

page_links = driver.find_elements(By.CSS_SELECTOR, "h5.profile-title + p a")
for link in page_links:
    href = link.get_attribute("href")
    urls.append(href)

for url in urls:
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "lxml")
    try:
        website = soup.select_one("p.adress-info a[target='_blank']").text
    except:
        website = ''
    data = {
        'website': website
    }
    product.append(data)

df = pd.DataFrame(product)
df.to_csv('firm.csv')
The website data ends up shifted down in the CSV file, as shown in the picture. Am I appending the data in the wrong way? Why is the data moving down, and where am I going wrong? Kindly recommend a fix.
I want the output in the format shown below. Kindly suggest a solution.
You can't append wev and data separately - you need website and name in the same dictionary for pandas to know that they belong to the same row.
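For example, each row should end up as a single dictionary holding both keys (the values here are hypothetical):

product = [
    {'Name': 'Jane Doe', 'website': 'https://example.com'},  # one row
    {'Name': 'John Roe', 'website': ''},                     # another row
]
df = pd.DataFrame(product)  # -> columns Name and website, one row per dict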
You could add the websites to a separate list, like:
sites = []
for url in urls:
    # driver.get(url), soup = BeautifulSoup(...), try:...except:... as before
    data = {
        'website': website
    }
    sites.append(data)
and then zip and combine:
for pi, dictPair in enumerate(zip(product, sites)):
    product[pi].update(dictPair[1])

df = pd.DataFrame(product)
df.to_csv('firm.csv')
However, I don't think it's the best way to make sure the right Names and Websites are matched up.
You should just add to the same dictionary for each row from the start instead of zipping and merging.
added_urls = []
product = []
soup = BeautifulSoup(driver.page_source, "lxml")
details = soup.select("tbody tr")
for detail in details:
    try:
        t1 = detail.select_one("h5.profile-title a").text
    except:
        # pass # then you'll just be using the previous row's t1
        # [also, if this happens in the first loop, it will raise an error]
        t1 = 'MISSING' # '' #
    wev = {
        'Name': t1,
    }
    href = detail.select_one("h5.profile-title + p a[href]")
    if href and href.get("href", '').startswith('http'):
        wev['page_link'] = href.get("href")
        added_urls.append(href.get("href"))
    product.append(wev)

### IF YOU WANT ROWS THAT CAN'T BE CONNECTED TO NAMES ###
page_links = driver.find_elements(By.CSS_SELECTOR, "h5.profile-title + p a")
for link in page_links:
    href = link.get_attribute("href")
    if href in added_urls: continue # skip links that are already added
    # urls.append(href)
    added_urls.append(href)
    product.append({"page_link": href})
##########################################################

for pi, prod in enumerate(product):
    if "page_link" not in prod or not prod["page_link"]: continue ## missing link
    url = prod["page_link"]
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "lxml")
    try:
        website = soup.select_one("p.adress-info a[target='_blank']").text
    except:
        website = ''
    del product[pi]["page_link"] ## REMOVE this line IF you want a page_link column in csv
    # data={'website':website}
    # product.append(data)
    product[pi]['website'] = website

df = pd.DataFrame(product)
df.to_csv('firm.csv')

I am trying to iterate on each element in column and click on it, but it gives me error. "can not locate the element"

Please, can anyone help me with this code?
I am trying to iterate over each element in a column and click on it, but it gives me the error "can not locate the element".
However, when I get rid of the for loop and try a single element, it works.
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome(executable_path="C:\Program Files (x86)\chromedriver.exe")
driver.get("http://www.dc.state.fl.us/OffenderSearch/list.aspx?TypeSearch=IR&Page=List&DataAction=Filter&dcnumber=&LastName=a&FirstName=&SearchAliases=1&OffenseCategory=&ZipCode=&ReleaseDateBeg=10%2f01%2f1997&ReleaseDateEnd=&CountyOfCommitment=&StatedCountyOfResidence=&ReleaseFacility=&photosonly=0&nophotos=1&matches=20")
driver.implicitly_wait(10)

for i in range(2,6):
    person = driver.find_element(By.XPATH, '//table[@id="ctl00_ContentPlaceHolder1_GrdReleasesPublic"]/tbody/tr[i]/td[1]/a').click()
    #person = driver.find_element(By.XPATH,"/html[1]/body[1]/div[5]/div[1]/div[1]/div[1]/form[1]/div[3]/div[1]/div[1]/div[3]/table[1]/tbody[1]/tr[row]/td[1]/a[1]").click()
    driver.implicitly_wait(5)

    # retrieving info about the inmate
    person_info = driver.find_element(By.CLASS_NAME, "offenderDetails").text
    alias = driver.find_element(By.ID, "ctl00_ContentPlaceHolder1_divAlias").text
    al = alias.replace('\n', ' ')
    y = person_info + "\n" + al
    #print(y)
    person_info.strip(',')

    with open('readme.txt', 'w') as f:
        f.write(y)
    #print(person_info)

    myfile = open("readme.txt", "r")
    data_dic = {}
    for line in myfile:
        #print(line)
        k, v = line.strip('').split(":")
        data_dic[k.strip()] = v.strip()
    myfile.close()
    print(data_dic)

    header = ['DC Number', 'Name', 'Race', 'Sex', 'Birth Date', 'Custody', 'Release Date', 'Aliases' ]
    new_dic = [data_dic]
    print(new_dic)

    with open('test4.csv', 'w') as csvfile1:
        writer = csv.DictWriter(csvfile1, fieldnames=header)
        writer.writeheader()
        writer.writerows(new_dic)

    driver.get("http://www.dc.state.fl.us/OffenderSearch/list.aspx?TypeSearch=IR&Page=List&DataAction=Filter&dcnumber=&LastName=a&FirstName=&SearchAliases=1&OffenseCategory=&ZipCode=&ReleaseDateBeg=10%2f01%2f1997&ReleaseDateEnd=&CountyOfCommitment=&StatedCountyOfResidence=&ReleaseFacility=&photosonly=0&nophotos=1&matches=20")
    driver.implicitly_wait(10)
You don't need to click on each link in the table; instead, you can capture all the href values in a list and then iterate over it and navigate to each page.
Code:
driver.get("http://www.dc.state.fl.us/OffenderSearch/list.aspx?TypeSearch=IR&Page=List&DataAction=Filter&dcnumber=&LastName=a&FirstName=&SearchAliases=1&OffenseCategory=&ZipCode=&ReleaseDateBeg=10%2f01%2f1997&ReleaseDateEnd=&CountyOfCommitment=&StatedCountyOfResidence=&ReleaseFacility=&photosonly=0&nophotos=1&matches=20")
WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, "//table[#id='ctl00_ContentPlaceHolder1_GrdReleasesPublic']")))
#Get list of urls
urlList=[url.get_attribute('href') for url in driver.find_elements(By.XPATH,"//table[#id='ctl00_ContentPlaceHolder1_GrdReleasesPublic']//tbody//td[2]//a")]
for url in urlList:
driver.get(url)
#do what you wish
You need to import the libraries below.
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
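For reference, the loop in the question also fails because the index is never substituted into the locator: the XPath string literally contains tr[i]. A minimal sketch of building the XPath per row, reusing the same table id, would be:

for i in range(2, 6):
    # interpolate the loop index into the XPath instead of the literal characters "i"
    xpath = f'//table[@id="ctl00_ContentPlaceHolder1_GrdReleasesPublic"]/tbody/tr[{i}]/td[1]/a'
    driver.find_element(By.XPATH, xpath).click()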
You can use row as a parameter to construct the XPath, get each element's href, then open the page in a new tab and get the personal info and alias. It is really easy and straightforward to do that using clicknium:
from clicknium import clicknium as cc

if not cc.chrome.extension.is_installed():
    cc.chrome.extension.install_or_update()

tab = cc.chrome.open("http://www.dc.state.fl.us/OffenderSearch/list.aspx?TypeSearch=IR&Page=List&DataAction=Filter&dcnumber=&LastName=a&FirstName=&SearchAliases=1&OffenseCategory=&ZipCode=&ReleaseDateBeg=10%2f01%2f1997&ReleaseDateEnd=&CountyOfCommitment=&StatedCountyOfResidence=&ReleaseFacility=&photosonly=0&nophotos=1&matches=20")
xpath_template = '//*[@id="ctl00_ContentPlaceHolder1_GrdReleasesPublic"]/tbody/tr[{}]/td[2]/a'

row = 2
while True:
    xpath = xpath_template.format(row)
    if tab.is_existing_by_xpath(xpath):
        href = tab.find_element_by_xpath(xpath).get_property('href')
        url = "http://www.dc.state.fl.us{}".format(href)
        new_tab = tab.browser.new_tab(url)
        person_info = new_tab.find_element_by_xpath('//table[@class="offenderDetails"]').get_text()
        alias = new_tab.find_element_by_xpath('//*[@id="ctl00_ContentPlaceHolder1_divAlias"]').get_text()
        al = alias.replace('\n', ' ')
        print(person_info)
        print(al)
        new_tab.close()
        row = row + 1
    else:
        break

How to scrape review data present in Read more in Flipkart reviews

I am trying to scrape Flipkart to extract reviews for a product using the requests and BeautifulSoup packages. How can I get the review text hidden behind the "Read more" click event in those reviews?
from selenium import webdriver
from selenium.webdriver.common.by import By
from contextlib import closing
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver import Firefox
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
import urllib2
import re
from bs4 import BeautifulSoup
import unicodedata

def remove_non_ascii_1(text):
    return ''.join([i if ord(i) < 128 else ' ' for i in text])

with closing(Firefox()) as browser:
    site = "https://www.flipkart.com/asus-zenfone-2-laser-ze550kl-black-16-gb/product-reviews/itme9j58yzyzqzgc?pid=MOBE9J587QGMXBB7"
    browser.get(site)

    file = open("review.txt", "w")

    for count in range(1, 10):
        nav_btns = browser.find_elements_by_class_name('_33m_Yg')

        button = ""
        for btn in nav_btns:
            number = int(btn.text)
            if(number==count):
                button = btn
                break

        button.send_keys(Keys.RETURN)
        WebDriverWait(browser, timeout=10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "_2xg6Ul")))

        read_more_btns = browser.find_elements_by_class_name('_1EPkIx')
        for rm in read_more_btns:
            browser.execute_script("return arguments[0].scrollIntoView();", rm)
            browser.execute_script("window.scrollBy(0, -150);")
            rm.click()

        page_source = browser.page_source
        soup = BeautifulSoup(page_source, "lxml")
        ans = soup.find_all("div", class_="_3DCdKt")

        for tag in ans:
            title = unicode(tag.find("p", class_="_2xg6Ul").string).replace(u"\u2018", "'").replace(u"\u2019", "'")
            title = remove_non_ascii_1(title)
            title.encode('ascii','ignore')
            content = tag.find("div", class_="qwjRop").div.prettify().replace(u"\u2018", "'").replace(u"\u2019", "'")
            content = remove_non_ascii_1(content)
            content.encode('ascii','ignore')
            content = content[15:-7]

            votes = tag.find_all("span", class_="_1_BQL8")
            upvotes = int(votes[0].string)
            downvotes = int(votes[1].string)

            file.write("Review Title : %s\n\n" % title )
            file.write("Upvotes : " + str(upvotes) + "\n\nDownvotes : " + str(downvotes) + "\n\n")
            file.write("Review Content :\n%s\n\n\n\n" % content )

    file.close()
Usage:
Install the requirements by running pip install bs4 selenium.
Add geckodriver to the PATH. Follow these instructions.
Put the link of the product in the site variable inside the script.
Run the script by running python scrape.py.
Reviews will be saved in the file review.txt.
Had some issues using @CSMaverick's code while accessing the READ MORE link. I modified the code as per my requirements.
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup as bs

def get_source_code(browser):
    rm_btns = browser.find_elements_by_class_name('_1BWGvX')
    for rm_btn in rm_btns:
        rm_btn.click()
    return browser.page_source

def collect_reviews_attributes(html):
    soup_obj = bs(html, "html.parser")
    text_tag_divs = soup_obj.find_all('div', attrs={"class", "t-ZTKy"})
    heading_tag_divs = soup_obj.find_all('p', attrs={"class", "_2-N8zT"})
    rating_tag_divs = soup_obj.find_all('div', attrs={"class", "_3LWZlK _1BLPMq"})
    text_tags = [tag.text for tag in text_tag_divs]
    heading_tags = [tag.text for tag in heading_tag_divs]
    rating_tags = [tag.text for tag in rating_tag_divs]
    return list(zip(heading_tags, text_tags, rating_tags))

collector_list = []
browser = webdriver.Firefox(executable_path=r"path to\geckodriver.exe")
url = "https://www.flipkart.com/samsung-253-l-frost-free-double-door-3-star-convertible-refrigerator/product-reviews/itmf75fa1554bad3?pid=RFRFNDEEJ28SNQPG&lid=LSTRFRFNDEEJ28SNQPGEJ3YHJ&sortOrder=MOST_HELPFUL&certifiedBuyer=false&aid=overall"
num_pages = 3 # get this from the url dynamically, or give a large number and keep going until you hit an exception

browser.get(url) # open the url in the browser
for _ in range(num_pages):
    page_source_code = get_source_code(browser)
    collector_list.extend(collect_reviews_attributes(page_source_code))
    next_page = browser.find_elements_by_class_name('_1LKTO3')[-1] # previous and next are under the same class; access the last element
    next_page.click()
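If you then want the collected (heading, text, rating) tuples written to a file, a minimal sketch using pandas (pandas is not part of this answer, just one convenient option):

import pandas as pd

# collector_list holds (heading, text, rating) tuples from collect_reviews_attributes
df = pd.DataFrame(collector_list, columns=["heading", "text", "rating"])
df.to_csv("reviews.csv", index=False)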

Exception when converting BeautifulSoup code to Selenium

I have the code below to scrape a site, and it has no problems. Then I wanted to use only Selenium, so I changed the code to the following, and now I get errors. I don't know why; can anyone help me?
webdriver.PhantomJS() Errors
Exception: Message: {"errorMessage":"Element does not exist in cache"
webdriver.Chrome() Error:
Exception: Message: stale element reference: element is not attached to the page document
Selenium only code
driver = webdriver.Chrome() # or webdriver.PhantomJS()
a = driver.find_elements_by_css_selector(findTag + "." + findValue + " a")
img = driver.find_elements_by_css_selector(findTag + "#" + findValue + "img")
href = a.get_attribute('href')
src = img.get_attribute("src")
Selenium + BeautifulSoup code:
driver = webdriver.Chrome() # or webdriver.PhantomJS()
soup = bs4.BeautifulSoup(driver.page_source, "html.parser")
a = soup.find(findTag, class_=findValue).find_all("a")
img = soup.find(findTag, id=findValue).find_all("img")
href = a.get("href")
src = img.get("src")
Have you tried to implement waits? It would go as follows:
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome() # or webdriver.PhantomJS()
# Here check that your image is in the page's document.
wait = WebDriverWait(driver, 30).until(EC.visibility_of_element_located((By.ID, "YourImgId")))
# Now try to find it in the DOM
img = driver.find_elements_by_css_selector(findTag + "#" + findValue + "img")
a = driver.find_elements_by_css_selector(findTag + "." + findValue + " a")
href = a.get_attribute('href')
src = img.get_attribute("src")
Hope this helps :)
About waits: http://selenium-python.readthedocs.io/waits.html
Edit: not a wait issue
Just navigate to the page with Selenium, enter your credentials, and then use BeautifulSoup to scrape the page. It should then be fine :)
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

ex_path = r"C:\chromedriver_win32\chromedriver.exe"

# Going to the link
driver = webdriver.Chrome(executable_path = ex_path)
driver.get("http://ipcamera-viewer.com/view/?camera_code=199619")

# Enter the password
code = driver.find_element_by_name("pass")
code.send_keys("5042")
code.send_keys(Keys.ENTER)

# Now get the soup
soup = BeautifulSoup(driver.page_source, "html.parser")
element_ = soup.find("ul", id = "grid")

images_links = []
for img in element_.find_all("img"):
    images_links.append(img.get("src"))

print images_links[0:2]
Output:
>>> [u'http://ipcamera-viewer.com/image/?p=199619_20170301_201334_5668.jpg', u'http://ipcamera-viewer.com/image/?p=199619_20170301_201329_5611.jpg']

Scraping: scraped links - now unable to scrape and dump html files into a folder

Using Python, Selenium, Sublime and Firefox: I am scraping the links off of this website and would like to save the scraped pages (as html files) into a folder. However, I have been working for days on trying to get the body of these html files to dump into a dropbox folder. The problem is 1) saving the html files and 2) saving them to a dropbox folder (or any folder).
I have successfully written code that will perform a search, then scrape the links off of a series of webpages. The following code works well for that.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import re
import csv
import pickle
import signal
import time

def handler(signum, frame):
    raise Exception('Last Resort!')

signal.signal(signal.SIGALRM,handler)

def isReady(browser):
    return browser.execute_script("return document.readyState")=="complete"

def waitUntilReady(browser):
    if not isReady(browser):
        waitUntilReady(browser)

def waitUntilReadyBreak(browser_b,url,counter):
    try:
        signal.alarm(counter)
        waitUntilReady(browser_b)
        signal.alarm(0)
    except Exception,e:
        print e
        signal.alarm(0)
        browser_b.close()
        browser_b = webdriver.Firefox()
        browser_b.get(url)
        waitUntilReadyBreak(browser_b,url,counter)
    return browser_b

browser = webdriver.Firefox()
thisurl = 'http://www.usprwire.com/cgi-bin/news/search.cgi'
browser.get(thisurl)
waitUntilReady(browser)
numarticles = 0
elem = WebDriverWait(browser, 60).until(EC.presence_of_element_located((By.NAME, "query")))
elem = browser.find_element_by_name("query")
elem.send_keys('"test"')
form = browser.find_element_by_xpath("/html/body/center/table/tbody/tr/td/table/tbody/tr[3]/td/table/tbody/tr[3]/td[2]/table/tbody/tr[3]/td/table/tbody/tr[1]/td/font/input[2]").click()

nextpage = False
all_newproduct_links = []
npages = 200

for page in range(1,npages+1):
    if page == 1:
        elems = browser.find_elements_by_tag_name('a')
        article_url = [elems.get_attribute("href")
                       for elems in browser.find_elements_by_class_name('category_links')]
        print page
        print article_url
        print "END_A_PAGE"

        elem = browser.find_element_by_link_text('[>>]').click()
        waitUntilReady(browser)

    if page >=2 <= 200:
        # click the dots
        print page
        print page
        print "B4 LastLoop"
        elems = WebDriverWait(browser, 60).until(EC.presence_of_element_located((By.CLASS_NAME, "category_links")))
        elems = browser.find_elements_by_tag_name('a')
        article_url = [elems.get_attribute("href")
                       for elems in browser.find_elements_by_class_name('category_links')]
        print page
        print article_url
        print "END_C_PAGE"

    # This is the part that will not work :(
    for e in elems:
        numarticles = numarticles+1
        numpages = 0
        numpages = numpages+1000
        article_url = e.get_attribute('href')
        print 'waiting'
        bodyelem.send_keys(Keys.COMMAND + "2")
        browser.get(article_url)
        waitUntilReady(browser)
        fw = open('/Users/My/Dropbox/MainFile/articlesdata/'+str(page)+str(numpages)+str(numarticles)+'.html','w')
        fw.write(browser.page_source.encode('utf-8'))
        fw.close()
        bodyelem2 = browser.find_elements_by_xpath("//body")[0]
        bodyelem2.send_keys(Keys.COMMAND + "1")
The above (for e in elems:) is meant to click on the page and create an html file containing the body of the scraped page. I seem to be missing something fundamental.
Any guidance at all would be most appreciated.
I think you are overcomplicating it.
There is at least one problem in this block:
elems = browser.find_elements_by_tag_name('a')
article_url = [elems.get_attribute("href")
               for elems in browser.find_elements_by_class_name('category_links')]
elems would contain the list of elements found by find_elements_by_tag_name(), but then you are reusing the same elems variable in the list comprehension. As a result, when you iterate over elems later, you get an error, since elems now refers to a single element and not a list.
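A minimal sketch of the same block with the comprehension given its own variable name, so the original list is not shadowed:

anchors = browser.find_elements_by_tag_name('a')  # the full list keeps its own name
article_urls = [link.get_attribute("href")
                for link in browser.find_elements_by_class_name('category_links')]
# later, "for e in anchors:" still iterates over a list of elements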
Anyway, here is the approach I would take:
gather all the article urls first
iterate over the urls one by one and save the HTML source using the page url name as a filename. E.g. _Iran_Shipping_Report_Q4_2014_is_now_available_at_Fast_Market_Research_326303.shtml would be the article filename
The code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

def isReady(browser):
    return browser.execute_script("return document.readyState") == "complete"

def waitUntilReady(browser):
    if not isReady(browser):
        waitUntilReady(browser)

browser = webdriver.Firefox()
browser.get('http://www.usprwire.com/cgi-bin/news/search.cgi')

# make a search
query = WebDriverWait(browser, 60).until(EC.presence_of_element_located((By.NAME, "query")))
query.send_keys('"test"')

submit = browser.find_element_by_xpath("//input[@value='Search']")
submit.click()

# grab article urls
npages = 4
article_urls = []
for page in range(1, npages + 1):
    article_urls += [elm.get_attribute("href") for elm in browser.find_elements_by_class_name('category_links')]
    browser.find_element_by_link_text('[>>]').click()

# iterate over urls and save the HTML source
for url in article_urls:
    browser.get(url)
    waitUntilReady(browser)

    title = browser.current_url.split("/")[-1]
    with open('/Users/My/Dropbox/MainFile/articlesdata/' + title, 'w') as fw:
        fw.write(browser.page_source.encode('utf-8'))
