Selenium Python web scraping UTF-8 - python

Maybe this question has been asked before, but since I could not find a proper answer, I dare to ask a similar one. My problem is that I have been trying to scrape a Turkish car sale web site named 'Sahibinden'. I use Jupyter Notebook and the Sublime editor. Once I try to get the data written into a CSV file, the Turkish letters change to different characters. I tried 'UTF-8 Encoding', '# -*- coding: utf-8 -*-', ISO 8859-9, etc., but I could not solve the problem. The other issue is that Sublime does not create the CSV file at all, although I did not have any problem in the Jupyter notebook. You will find the CSV file output in the image link. If someone could reply I would appreciate it.
Note: the program works without a problem once I run the print commands in the editors.
Thanks a lot.
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException
import unicodedata

with open('result1.csv', 'w') as f:
    f.write('brand, model, year, oil_type, gear, odometer, body, hp, '
            'eng_dim, color, warranty, condition, price, safe, '
            'in_fea, outs_fea, mul_fea, pai_fea, rep_fea, acklm \n')

chrome_path = r"C:\Users\Mike\Desktop\chromedriver.exe"
driver = webdriver.Chrome(chrome_path)

def final_page(fn_20):
    for lur in fn_20:
        driver.get(lur)
        brand = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[3]/span''')
        brand = brand.text
        brand = brand.encode("utf-8")
        print(brand)
        model = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[5]/span''')
        model = model.text
        model = model.encode("utf-8")
        print(model)
        year = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[6]/span''')
        year = year.text
        year = year.encode("utf-8")
        print(year)
        oil_type = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[7]/span''')
        oil_type = oil_type.text
        oil_type = oil_type.encode("utf-8")
        print(oil_type)
        gear = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[8]/span''')
        gear = gear.text
        gear = gear.encode("utf-8")
        print(gear)
        odometer = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[9]/span''')
        odometer = odometer.text
        odometer = odometer.encode("utf-8")
        print(odometer)
        body = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[10]/span''')
        body = body.text
        body = body.encode("utf-8")
        print(body)
        hp = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[11]/span''')
        hp = hp.text
        hp = hp.encode("utf-8")
        print(hp)
        eng_dim = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[12]/span''')
        eng_dim = eng_dim.text
        eng_dim = eng_dim.encode("utf-8")
        print(eng_dim)
        color = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[14]/span''')
        color = color.text
        color = color.encode("utf-8")
        print(color)
        warranty = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[15]/span''')
        warranty = warranty.text
        warranty = warranty.encode("utf-8")
        print(warranty)
        condition = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[19]/span''')
        condition = condition.text
        condition = condition.encode("utf-8")
        print(condition)
        price = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/h3''')
        price = price.text
        price = price.encode("utf-8")
        print(price)
        safe = ''
        safety1 = driver.find_elements_by_xpath('''//div[@id='classifiedProperties']/ul[1]/li[@class='selected']''')
        for ur in safety1:
            ur1 = ur.text
            ur1 = ur1.encode("utf-8")
            safe += ur1 + ', '
        print(safe)
        in_fea = ''
        in_features = driver.find_elements_by_xpath('''//div[@id='classifiedProperties']/ul[2]/li[@class='selected']''')
        for ins in in_features:
            ins1 = ins.text
            ins1 = ins1.encode("utf-8")
            in_fea += ins1 + ', '
        print(in_fea)
        outs_fea = ''
        out_features = driver.find_elements_by_xpath('''//div[@id='classifiedProperties']/ul[3]/li[@class='selected']''')
        for outs in out_features:
            out1 = outs.text
            out1 = out1.encode("utf-8")
            outs_fea += out1 + ', '
        print(outs_fea)
        mul_fea = ''
        mult_features = driver.find_elements_by_xpath('''//div[@id='classifiedProperties']/ul[4]/li[@class='selected']''')
        for mults in mult_features:
            mul = mults.text
            mul = mul.encode("utf-8")
            mul_fea += mul + ', '
        print(mul_fea)
        pai_fea = ''
        paint = driver.find_elements_by_xpath('''//div[@class='classified-pair custom-area ']/ul[1]/li[@class='selected']''')
        for pai in paint:
            pain = pai.text
            pain = pain.encode("utf-8")
            pai_fea += pain + ', '
        print(pai_fea)
        rep_fea = ''
        replcd = driver.find_elements_by_xpath('''//div[@class='classified-pair custom-area']/ul[2]/li[@class='selected']''')
        for rep in replcd:
            repa = rep.text
            repa = repa.encode("utf-8")
            rep_fea += repa + ', '
        print(rep_fea)
        acklm = driver.find_element_by_xpath('''//div[@id='classified-detail']/div[@class='uiBox'][1]/div[@id='classifiedDescription']''')
        acklm = acklm.text
        acklm = acklm.encode("utf-8")
        print(acklm)
        try:
            with open('result1.csv', 'a') as f:
                f.write(brand + ',' + model + ',' + year + ',' + oil_type + ',' + gear + ',' + odometer + ',' + body + ',' + hp + ',' + eng_dim + ',' + color + ',' + warranty + ',' + condition + ',' + price + ',' + safe + ',' + in_fea + ',' + outs_fea + ',' + mul_fea + ',' + pai_fea + ',' + rep_fea + ',' + acklm + '\n')
        except Exception as e:
            print(e)
    driver.close()

Open the output file through codecs with an explicit UTF-8 encoding; the u'\ufeff' written first is the BOM, which helps Excel detect the encoding:
import codecs
file = codecs.open("utf_test", "w", "utf-8")
file.write(u'\ufeff')
file.write("test with utf-8")
file.write("字符")
file.close()
Or this also works for me:
with codecs.open("utf_test", "w", "utf-8-sig") as temp:
    temp.write("this is a utf-test\n")
    temp.write(u"test")

Related

Etherscan API output won't update

When I use this loop, every time it prints out the gas prices they stay the same and it won't print out the most recent gas prices:
from time import sleep
from etherscan import Etherscan  # imports were missing from the snippet; etherscan-python package assumed

while True:
    eth = Etherscan("keys")
    gas_oracle = eth.get_gas_oracle()
    safe_gas = gas_oracle["SafeGasPrice"]
    proposed_gas = gas_oracle["ProposeGasPrice"]
    fast_gas = gas_oracle["FastGasPrice"]
    safe_message = "Safe:" + safe_gas + ' gwei'
    proposed_gas_message = "Proposed:" + proposed_gas + " gwei"
    fast_gas_message = "Fast:" + fast_gas + " gwei"
    price_in_eth = int(safe_gas) * 0.00000000111
    print_gas_prices = (f'gas prices :' '\n'
                        f' {safe_message} {proposed_gas_message} {fast_gas_message} ')
    print(print_gas_prices)
    sleep(9)

Extract values from specific parts of a text file

I have a text file from which I need to extract some values, but several times I get more than one value between the first line and the last line that starts with 0089 (CONVENIO) in the quadrant.
With my code I can only make it write the title I defined into the txt, and it keeps repeating the first agreement found; I need it to walk through the text and bring me the new information from the other quadrants.
I need the loop because there can be multiple agreements, so I can't simply "anchor" the lines at fixed positions.
import re
import os

inicio = (' YM-INFRA-CASH MANAGMENT DEST.: 001-0001-CENTRAL ')
lista = []
contador = 3
banco = ' 0089'
convenio = ''

with open(caminho + '/Downloads/TESTE.txt', 'r') as arquivo:
    for line in arquivo:
        if line.strip() == inicio.strip():
            localizar = arquivo.readlines()
            inicio = localizar[contador]
            van = inicio[13:17]
            nomevan = inicio[20:50].strip()
            inicio = localizar[contador + 1]
            ag = inicio[13:17]
            nomeag = inicio[20:50].strip()
            inicio = localizar[contador + 2]
            cliente = inicio[13:50].strip()
            contadorum = 9
            while localizar[contadorum][1:5] == '0033':
                convenio = localizar[contadorum][1:22].strip()
                narqrem = localizar[contadorum][22:34].strip()
                bytesrem = localizar[contadorum][34:51].strip()
                narqret = localizar[contadorum][51:63].strip()
                bytesret = localizar[contadorum][63:81].strip()
                totalbytes = localizar[contadorum][81:99].strip()
                percrateio = localizar[contadorum][99:112].strip()
                print(van, nomevan)
                print(ag, nomeag)
                print(cliente)
                print(convenio, narqrem, bytesrem, narqret, bytesret, totalbytes, percrateio)
                lista.append(convenio + narqrem + bytesrem + narqret + bytesret + totalbytes + percrateio + '\n')
                with open(caminho + '/Downloads/testefim.txt', 'w') as consolidado:
                    consolidado.write('CONVENIO' + ';' + 'N ARQ REMES' + ';' + 'BYTES REMES' + ';' + 'N ARQ.RET.' + ';' + 'BYTES RET.' + ';' + 'TOTAL BYTES' + ';' + '% RATEIO' + '\n')
                    for linha in lista:
                        consolidado.write(convenio + ';' + narqrem + ';' + bytesrem + ';' + narqret + ';' + bytesret + ';' + totalbytes + ';' + percrateio + '\n')
                    consolidado.close()
        else:
            pass
    arquivo.close()
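The repetition comes from two places in the code above: the while loop never advances contadorum, and the final write uses the last-read variables instead of the rows collected in lista. A minimal sketch of one possible fix, reusing the question's names, its '0033' marker, and its column offsets as given (not verified against the real file):
# inside the 'if line.strip() == inicio.strip():' block, replace the while loop with:
contadorum = 9
while contadorum < len(localizar) and localizar[contadorum][1:5] == '0033':
    registro = localizar[contadorum]
    convenio = registro[1:22].strip()
    narqrem = registro[22:34].strip()
    bytesrem = registro[34:51].strip()
    narqret = registro[51:63].strip()
    bytesret = registro[63:81].strip()
    totalbytes = registro[81:99].strip()
    percrateio = registro[99:112].strip()
    lista.append(';'.join([convenio, narqrem, bytesrem, narqret,
                           bytesret, totalbytes, percrateio]) + '\n')
    contadorum += 1  # advance to the next agreement line

# after the whole input has been scanned (outside the 'with open(... TESTE.txt ...)' block):
with open(caminho + '/Downloads/testefim.txt', 'w') as consolidado:
    consolidado.write('CONVENIO;N ARQ REMES;BYTES REMES;N ARQ.RET.;BYTES RET.;TOTAL BYTES;% RATEIO\n')
    for linha in lista:
        consolidado.write(linha)  # write each collected row, not the last-read variables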

python read all files from a folder and write the file name and other info into a txt file

I have 30,911 HTML files. I need to do web scraping and then save the info into a txt file named index.txt.
It should look like
filename1, title, t1, date, p1
filename2, title, t1, date, p1
filename3, title, t1, date, p2
and so on...
I only want the filename, but the output gave me path + filename.
Your problem is that filename is really a file path; to get just the filename you can use the os module:
os.path.basename('filepath')
So, in order to write to the file:
indexFile.write(os.path.basename(filename)+ ', ' + title.get_text(strip=True) + ', '+ ticker.get_text(strip=True) + ', ' + d_date.get_text(strip=True) + ', ' + parti_names + '\n')
You can use:
# imports needed by this snippet
import glob
import os
import re
import bs4

path = 'C:/Users/.../.../output/'
# read html files
for filename in glob.glob(os.path.join(path, '*.html')):
    soup = bs4.BeautifulSoup(open(filename).read(), "lxml")
    title = soup.find('h1')
    ticker = soup.find('p')
    d_date = soup.find_all('div', {"id": "a-body"})[0].find_all("p")[2]
    try:
        def find_participant(tag):
            return tag.name == 'p' and tag.find("strong", text=re.compile(r"Executives|Corporate Participants"))
        participants = soup.find(find_participant)
        parti_names = ""
        for parti in participants.find_next_siblings("p"):
            if parti.find("strong", text=re.compile(r"(Operator)")):
                break
            parti_names += parti.get_text(strip=True) + ","
    except:
        indexFile = open('C:/Users/.../output1/' + 'index.txt', 'a+')
        indexFile.write(filename + ', ' + title.get_text(strip=True) + ', ' + ticker.get_text(strip=True) + ', ' + d_date.get_text(strip=True) + ', ' + 'No participants' + '\n')
    else:
        participants = soup.find(find_participant)
        parti_names = ""
        for parti in participants.find_next_siblings("p"):
            if parti.find("strong", text=re.compile(r"(Operator)")):
                break
            parti_names += parti.get_text(strip=True) + ","
        indexFile = open('C:/Users/.../output1/' + 'index.txt', 'a+')
        indexFile.write(os.path.basename(filename) + ', ' + title.get_text(strip=True) + ', ' + ticker.get_text(strip=True) + ', ' + d_date.get_text(strip=True) + ', ' + parti_names + '\n')
    indexFile.close()
ntpath is another module that can be used to get the base name from a path:
>>> import ntpath
>>> ntpath.basename('C:/Users/.../output1/' + 'index.txt')
'index.txt'

Retrieve Data and Link in Beautiful Soup and retrieve data if Link not present

OK, I wasn't clear enough before. What I am trying to do is take the list of college teams and their URLs from http://www.cfbstats.com/2014/player/index.html and export it to CSV. I have done that successfully. From there I go into each team and grab each player and their link. If a player does not have a link, it should just put their data in the CSV. I currently only get players with URLs, not the ones without. Eventually I will want to go into each player page, grab each of their stats, and write them to a CSV.
Sorry for all the confusion in the original post.
import csv
import sys
import json
import urllib
import requests
from bs4 import BeautifulSoup

def getCollegeandURL():
    f = open('colleges.csv', 'w')
    f.write("Teams" + "," + "," + "URL" + '\n')
    originalurl = "http://www.cfbstats.com/2014/player/index.html"
    base = requests.get("http://www.cfbstats.com/2014/player/index.html")
    base = base.text
    soup = BeautifulSoup(base)
    # this is to find all the colleges in the div conference
    mydivs = soup.find_all('div', {'class': 'conference'})
    ##g
    g = open('rosters.csv', 'w')
    g.write("College Rosters" + '\n' + '\n' + 'College' + ',' + ',' + 'Playernumber' + ',' + 'Player Last Name' + ',' + 'Player First Name' + ',' + 'Position' + ',' + 'Year' + ',' + 'Height' + ',' + ' Weight' + ',' + 'Hometown' + ',' + 'State' + ',' + 'Last School' + ',' + '\n')
    # this for loop writes each college to a line
    for div in mydivs:
        urls = div.findAll('a')
        # this is to pull all the college names and each of their links
        for url in urls:
            college = url.text
            url = url.attrs['href']
            teamurl = originalurl[:23] + url
            f.write(college[:] + ',' + ',' + teamurl[:] + '\n')
            scrapeRosters(college, teamurl, g)

def scrapeRosters(college, teamurl, g):
    # g is the excel document to read into
    # college is the college name
    # teamurl is the url link to that team's roster
    roster = requests.get(teamurl)
    roster = roster.text
    roster = BeautifulSoup(roster)
    teamname = roster.find_all('h1', {'id': 'pageTitle'})
    teamAndPlayers = {}
    table = roster.find_all('table', {'class': 'team-roster'})
    for i in table:
        rows = i.find_all('tr')
        for row in rows[1:]:
            # this retrieves the player url
            for item in row.findAll('a'):
                if item not in row.findAll('a'):
                    row = row.text
                    row = row.split('\n')
                    row = str(row)
                    g.write(college + ',' + row + ',' + ',' + '\n')
                elif (item['href'].startswith('/')):
                    playerurl = item.attrs['href']
                    row = row.text
                    row = row.split('\n')
                    row = str(row)
                    g.write(college + ',' + row + ',' + ',' + playerurl + ',' + '\n')

def main():
    getCollegeandURL()

main()
I believe the error is in my if and elif statements.
import urllib, bs4
data = urllib.urlopen('http://www.cfbstats.com/2014/team/140/roster.html')
soup = bs4.BeautifulSoup(data.read())  # creates a BS4 HTML parsing object
for row in soup('tr')[1:]:
    data = [str(i.getText()) for i in row('td')]
    link = row('td')[1]('a')  # the linked player
    if len(link) > 0:
        link = str(link[0]['href'])
        data = [str(link)] + data
    print data
    print '\n'

Beautiful Soup: strip HTML caption tags and th class tags, and retrieve data not in a list

So I have created a web scraper that goes to cfbstats.com/2014/player/index.html and retrieves all the college football teams and their links. From there it goes into each link and takes the roster and each player's link. Finally it goes into each player's link and takes his stats.
I am currently having a problem with taking the players' stats. When I call the header of each table I get the printed output [Tackle], and when I call the first row of the table I get [G]. I would like to get rid of those tags. I was able to avoid them in my earlier functions. Any help would be appreciated.
import csv
import sys
import json
import urllib
import requests
from bs4 import BeautifulSoup
import xlrd
import xlwt

def getCollegeandURL():
    f = open('colleges.csv', 'w')
    f.write("Teams" + "," + "," + "URL" + '\n')
    originalurl = "http://www.cfbstats.com/2014/player/index.html"
    base = requests.get("http://www.cfbstats.com/2014/player/index.html")
    base = base.text
    soup = BeautifulSoup(base)
    # this is to find all the colleges in the div conference
    mydivs = soup.find_all('div', {'class': 'conference'})
    ##g is an excel document for the roster
    g = open('rosters.csv', 'w')
    g.write("College Rosters" + '\n' + '\n' + 'College' + ',' + 'Playernumber' + ',' + 'Player Last Name' + ',' + 'Player First Name' + ',' + 'Position' + ',' + 'Year' + ',' + 'Height' + ',' + ' Weight' + ',' + 'Hometown' + ',' + 'State' + ',' + 'Last School' + ',' + '\n')
    # h is an excel for each player stats
    h = xlwt.Workbook()
    # this for loop writes each college to a line
    for div in mydivs:
        urls = div.findAll('a')
        # this is to pull all the college names and each of their links
        for url in urls:
            college = url.text
            url = url.attrs['href']
            teamurl = originalurl[:23] + url
            f.write(college[:] + ',' + ',' + teamurl[:] + '\n')
            scrapeRosters(college, teamurl, g, h)

############################################################################
def scrapeRosters(college, teamurl, g, h):
    # create the excel documents
    # this gets the pages of teams
    roster = requests.get(teamurl)
    roster = roster.text
    roster = BeautifulSoup(roster)
    teamname = roster.find_all('h1', {'id': 'pageTitle'})
    teamAndPlayers = {}
    table = roster.find_all('table', {'class': 'team-roster'})
    for i in table:
        rows = i.find_all('tr')
        for row in rows[1:]:
            data = [str(i.getText()) for i in row('td')]
            link = row('td')[1]('a')
            if len(link) > 0:
                link = str(link[0]['href'])
                data = [str(link)] + data
                # unpacking data into variables
                (playerurl, playernumber, playerName, playerPosition, YearinCollege, playerHeight, playerWeight, playerHometown, lastSchool) = data
                # creating the full player url
                playerurl = teamurl[:23] + playerurl
                # repacking the data
                data = (college, playernumber, playerName, playerPosition, YearinCollege, playerHeight, playerWeight, playerHometown, lastSchool)
                g.write(college + ',' + playernumber + ',' + playerName + ',' + playerPosition + ',' + YearinCollege + ',' + playerHeight + ',' + playerWeight + ',' + playerHometown + ',' + lastSchool + ',' + ',' + playerurl + ',' + '\n')
                playerStats(data, playerurl, h)

############################################################################
def playerStats(data, playerurl, h):
    playerurl = requests.get(playerurl)
    playerurl = playerurl.text
    playerurl = BeautifulSoup(playerurl)
    tablestats = playerurl.find_all('table', {'class': 'player-home'})
    (college, playernumber, playerName, playerPosition, YearinCollege, playerHeight, playerWeight, playerHometown, lastSchool) = data
    #print college, playernumber, playerName
    print college, playerName, playernumber
    for x in tablestats:
        caption = x.find_all('caption')
        rows = x.find_all('tr')
        ## caption = caption.strip
        for row in rows:
            headers = x.find_all('th')
            headers = [str(i.getText()) for i in row('tr')]
            stats = [str(x.getText()) for x in row('td')]
            print caption, headers, stats

############################################################################
def main():
    getCollegeandURL()

main()
Don't work so hard, your data is already available in parseable form.
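As a side note on the bracketed output in the question: find_all() returns a list of tags, so printing it shows the whole [<caption>Tackle</caption>]-style markup. A minimal sketch of pulling plain text out instead, reusing the tablestats variable from playerStats above and assuming the same page structure:
rows_as_text = []
for table in tablestats:
    # find('caption') returns a single Tag (or None); get_text() drops the markup
    caption = table.find('caption')
    caption_text = caption.get_text(strip=True) if caption else ''
    for row in table.find_all('tr'):
        headers = [th.get_text(strip=True) for th in row.find_all('th')]
        cells = [td.get_text(strip=True) for td in row.find_all('td')]
        rows_as_text.append((caption_text, headers, cells))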
