How can I change python code to scrape text with accented characters? - python

I wrote a code to scrape articles from a particular website so that I can put the csv created from this code to Geneea (text analysis program). The problem is that I wrote this code using unicode, but I then realized I need to scrape the text with accented characters. This code as it is gives me the output I need, but to have text with accented characters is vital for the text analysis program. Do you have any suggestions on how I can change this code?
Thank you all very much!
My code is below:
import requests
from bs4 import BeautifulSoup
import json
import csv
from unidecode import unidecode
def datetonumeric(stringdate):
spliteddate=stringdate.split()
data=(spliteddate[0]).split(".")
day=int(data[0])
themonth=int(data[1])
year=int(data[2])
if(themonth >10 or themonth < 7):
return ""
if(themonth==10 and day>7):
return ""
return f'{day}/{themonth}/{year}'
count=0
with open('parlamentnilistyoutput.csv', 'w', newline='', encoding="UTF-8") as csvfile:
spamwriter = csv.writer(csvfile, delimiter=',')
spamwriter.writerow(['id_clanku','zdroj','datum','title','perex','text','url'])
for i in range(20,70):
r=requests.get(f'https://www.parlamentnilisty.cz/special/Volby%202021?p={i}')
soup = BeautifulSoup(r.text, 'html.parser')
articles=soup.select(".articles-list ul.list-unstyled li")
for article in articles:
try:
# id=(json.loads(article['data-track-list']))['item']['id']
id_clanku=f'PA000{count+1}'
urlselector=(article.select("a"))
url=f"https://www.parlamentnilisty.cz{(urlselector[0])['href']}"
r=requests.get(url)
soup=BeautifulSoup(r.text, 'html.parser')
dateselector=soup.select('div.time')
date=(dateselector[0]).get_text()
date=datetonumeric(date)
print(date)
if(date!=""):
titleselector=soup.select('.article-header h1')
title=titleselector[0].get_text()
title=title.replace(","," ")
pretextselector=soup.select("p.brief")
pretext=pretextselector[0].get_text()
pretext=pretext.replace(","," ")
alltext=soup.select('.article-content > p')
maintext=""
for text in alltext:
maintext=maintext + (text.get_text()).replace("\n"," ")
maintext=maintext+"\n"
maintext=maintext.replace(","," ")
spamwriter.writerow([id_clanku,'parlamentnilisty',date,unidecode(title),unidecode(pretext),unidecode(maintext),url])
count=count+1
except:
print("wrong request")
break

Related

How to scrape the website properly and getting all td texts from website

I am new to python. is anyone know {sum(int(td.text) for td in soup.select('td:last-child')[1:])} what is use of [1:] in this or [0] or [1]. i saw it in many scraping examples below for in loop. As i was practicing i build this code and don't able to scrape all data in csv file. thanks in advance, sorry for two question at one time.
import requests
from bs4 import BeautifulSoup
import csv
url= "https://iplt20.com/stats/2020/most-runs"
r= requests.get (url)
soup= BeautifulSoup (r.content, 'html5lib')
lst= []
table=soup.find ('div', attrs = {'class':'js-table'})
#for row in table.findAll ('div', attrs= {'class':'top-players__player-name'}):
# score = {}
# score['Player'] = row.a.text.strip()
# lst.append(score)
for row in table.findAll (class_='top-players__m top-players__padded '):
score = {}
score['Matches'] = int(row.td.text)
lst.append(score)
filename= 'iplStat.csv'
with open (filename, 'w', newline='') as f:
w= csv.DictWriter(f,['Player', 'Matches'])
w.writeheader()
for score in lst:
w.writerow(score)
print (lst)
All of this is not even needed. Just use pandas:
import requests
import pandas as pd
url = "https://iplt20.com/stats/2020/most-runs"
r = requests.get (url)
df = pd.read_html(r.content)[0]
df.to_csv("iplStats.csv", index = False)
Screenshot of csv file:

Issues while writing special characters to csv file

I am writing the crawled output of a webpage to CSV files. However few special characters such as 'hyphen' is not getting parsed correctly.
Original Text : Amazon Forecast - Now Generally Available
Result in csv : Amazon Forecast – Now Generally Available
I tried the below code
from bs4 import BeautifulSoup
from datetime import date
import requests
import csv
source = requests.get('https://aws.amazon.com/blogs/aws/').text
soup = BeautifulSoup(source, 'lxml')
# csv_file = open('aitrendsresults.csv', 'w')
csv_file = open('aws_cloud_results.csv', 'w' , encoding = 'utf8' )
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['title','img','src','summary'])
match = soup.find_all('div',class_='lb-row lb-snap')
for n in match:
imgsrc= n.div.img.get('src')
titlesrc= n.find('div',{'class':'lb-col lb-mid-18 lb-tiny-24'})
titletxt= titlesrc.h2.text
anchortxt= titlesrc.a.get('href')
sumtxt= titlesrc.section.p.text
print(sumtxt)
csv_writer.writerow([titletxt,imgsrc,anchortxt,sumtxt])
csv_file.close()
Can you please help me to get the text like the same in original text provided above.
Create a function to handle ASCII characters (i.e. Hyphen, Semicolon) and pass the string as argument inside the function below:
def decode_ascii(string):
return string.encode('ascii', 'ignore').decode('ascii')
input_text = 'Amazon Forecast - Now Generally Available'
output_text = decode_ascii(input_text)
print(output_text)
Output should be Amazon Forecast - Now Generally Available in the CSV.
I've been working with BS as well and I think you've only made a minor mistake. In line 8, where you open the csv file, the encoding should be "UTF-8" instead of "utf8".
See if that helps.
Using title as test the following works for me
from bs4 import BeautifulSoup
import requests, csv
source = requests.get('https://aws.amazon.com/blogs/aws/').text
soup = BeautifulSoup(source, 'lxml')
with open("aws_cloud_results.csv", "w", encoding="utf-8-sig", newline='') as csv_file:
w = csv.writer(csv_file, delimiter = ";", quoting=csv.QUOTE_MINIMAL)
w.writerow(['title'])
match = soup.find_all('div',class_='lb-row lb-snap')
for n in match:
titlesrc= n.find('div',{'class':'lb-col lb-mid-18 lb-tiny-24'})
titletxt= titlesrc.h2.text
w.writerow([titletxt])

How to write csv and insert scrape data

I am designing scraping project for my research but i am stuck in to write scrape data in csv. Please help me for that?
i have successfully scrape data but i want to store it in csv here below is my code
need to write code to pull all of the html from a website then save it to a csv file.
I believe I somehow need to turn the links into a list and then write the list, but I'm unsure how to do that.
This is what I have so far:
import requests
import time
from bs4 import BeautifulSoup
import csv
# Collect and parse first page
page = requests.get('https://www.myamcat.com/jobs')
soup = BeautifulSoup(page.content, 'lxml')
print("Wait Scraper is working on ")
time.sleep(10)
if(page.status_code != 200):
print("Error in Scraping check the url")
else:
print("Successfully scrape the data")
time.sleep(10)
print("Loading data in csv")
file = csv.writer(open('dataminer.csv', 'w'))
file.writerow(['ProfileName', 'CompanyName', 'Salary', 'Job', 'Location'])
for pname in soup.find_all(class_="profile-name"):
#print(pname.text)
profname = pname.text
file.writerow([profname, ])
for cname in soup.find_all(class_="company_name"):
print(cname.text)
for salary in soup.find_all(class_="salary"):
print(salary.text)
for lpa in soup.find_all(class_="jobText"):
print(lpa.text)
for loc in soup.find_all(class_="location"):
print(loc.text)
Make a dict and save the data into it then save to csv, check below code!
import requests
import time
from bs4 import BeautifulSoup
import csv
# Collect and parse first page
page = requests.get('https://www.myamcat.com/jobs')
soup = BeautifulSoup(page.content, 'lxml')
data = []
print("Wait Scrapper is working on ")
if(page.status_code != 200):
print("Error in Srapping check the url")
else:
print("Successfully scrape the data")
for x in soup.find_all('div',attrs={'class':'job-page'}):
data.append({
'pname':x.find(class_="profile-name").text.encode('utf-8'),
'cname':x.find(class_="company_name").text.encode('utf-8'),
'salary':x.find(class_="salary").text.encode('utf-8'),
'lpa':x.find(class_="jobText").text.encode('utf-8'),
'loc':x.find(class_="location").text.encode('utf-8')})
print("Loading data in csv")
with open('dataminer.csv', 'w') as f:
fields = ['salary', 'loc', 'cname', 'pname', 'lpa']
writer = csv.DictWriter(f, fieldnames=fields)
writer.writeheader()
writer.writerows(data)
Apart from what you have got in other answer, you can scrape and write the content at the same time as well. I used .select() instead of .find_all() to achieve the same.
import csv
import requests
from bs4 import BeautifulSoup
URL = "https://www.myamcat.com/jobs"
page = requests.get(URL)
soup = BeautifulSoup(page.text, 'lxml')
with open('myamcat_doc.csv','w',newline="",encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(['pname','cname','salary','loc'])
for item in soup.select(".job-listing .content"):
pname = item.select_one(".profile-name h3").get_text(strip=True)
cname = item.select_one(".company_name").get_text(strip=True)
salary = item.select_one(".salary .jobText").get_text(strip=True)
loc = item.select_one(".location .jobText").get_text(strip=True)
writer.writerow([pname,cname,salary,loc])

Multiple for loops and csv files

I am new here and newbie with python and currently learning some basic stuff, mostly scraping and I encountered a problem that I hope you can help me to solve.
I'm trying to scrape few details from a website and writing them into a CSV file but I'm able to write only the last results into my CSV, apparently my script just overwrite the data.
Also if you find any mistakes on my code or any room for improvement (which I'm sure there are) I'd be glad if you will point them out as well.
Also2, any recommendation for videos/tutorials that can help me improve my python and scraping skills would be appreciated.
import requests
from bs4 import BeautifulSoup
import csv
url = 'https://www.tamarackgc.com/club-contacts'
source = requests.get(url).text
soup = BeautifulSoup (source, 'lxml')
csv_file = open('contacts.csv', 'w')
csv_writer = csv.writer (csv_file)
csv_writer.writerow(["department", "name", "position", "phone"])
for department in soup.find_all("div", class_="view-content"):
department_name = department.h3
print (department_name.text)
for contacts in soup.find_all("div", class_="col-md-7 col-xs-10"):
contact_name = contacts.strong
print(contact_name.text)
for position in soup.find_all("div", class_="field-content"):
print(position.text)
for phone in soup.find_all("div", class_="modal-content"):
first_phone = phone.h3
first_phones = first_phone
print(first_phones)
csv_writer.writerow([department_name, contact_name, position, first_phones])
csv_file.close()
Thanks Thomas,
Actually I tweaked my code a little bit by thinking how I can make it simpler (four for loops are too much, no?) so with the following code I solved my problem(dropped the 'department' and 'phones' because some other issues):
import requests
from bs4 import BeautifulSoup
import csv
url = 'https://www.tamarackgc.com/club-contacts'
source = requests.get(url).text
soup = BeautifulSoup (source, 'lxml')
f = open("contactslot.csv", "w+")
csv_writer = csv.writer (f)
csv_writer.writerow(["Name", "Position"])
infomation = soup.find_all("div", class_="well profile")
info = information[0]
for info in information:
contact_name = info.find_all("div", class_="col-md-7 col-xs-10")
names = contact_name[0].strong
name = names.text
print (name)
position_name = info.find_all("div", class_="field-content")
position = position_name[0].text
print(position)
print("")
csv_writer.writerow([name, position])
f.close()
Hi Babr welcome to use python. Your answer is good and here is one more little thing you may can do better.
use find replace find_all if you just want one element
import requests
from bs4 import BeautifulSoup
import csv
url = 'https://www.tamarackgc.com/club-contacts'
source = requests.get(url).text
soup = BeautifulSoup(source, 'lxml')
f = open("/Users/mingjunliu/Downloads/contacts.csv", "w+")
csv_writer = csv.writer(f)
csv_writer.writerow(["Name", "Position"])
for info in soup.find_all("div", class_="well profile"):
contact_name = info.find("div", class_="col-md-7 col-xs-10")
names = contact_name.strong
name = names.text
print(name)
position_name = info.find("div", class_="field-content")
position = position_name.text
print(position)
print("")
csv_writer.writerow([name, position])
f.close()
And the reason you need to drop phone and department is because of the bad website structure. It's not your fault.

Loop not working for scraping data using python and beautifulsoup4

My goal is to scrape data from the PGA website to extract all the golf course locations in the USA. I aim to scrape from the 907 pages the name, address, ownership, phone number, and website.
I have created the script below but when the CSV is created it produces errors. The CSV file created from the script has data repetitions of the first few pages and the pages of the website. It does not give the whole data of the 907 pages.
How can I fix my script so that it will scrape all 907 pages and produce a CSV with all the golf courses listed on the PGA website?
Below is my script:
import csv
import requests
from bs4 import BeautifulSoup
for i in range(907): # Number of pages plus one
url = "http://www.pga.com/golf-courses/search?page={}&searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0".format(i)
r = requests.get(url)
soup = BeautifulSoup(r.content)
g_data2=soup.find_all("div",{"class":"views-field-nothing"})
courses_list=[]
for item in g_data2:
try:
name=item.contents[1].find_all("div",{"class":"views-field-title"})[0].text
except:
name=''
try:
address1=item.contents[1].find_all("div",{"class":"views-field-address"})[0].text
except:
address1=''
try:
address2=item.contents[1].find_all("div",{"class":"views-field-city-state-zip"})[0].text
except:
address2=''
try:
website=item.contents[1].find_all("div",{"class":"views-field-website"})[0].text
except:
website=''
try:
Phonenumber=item.contents[1].find_all("div",{"class":"views-field-work-phone"})[0].text
except:
Phonenumber=''
course=[name,address1,address2,website,Phonenumber]
courses_list.append(course)
with open ('PGA_Data.csv','a') as file:
writer=csv.writer(file)
for row in courses_list:
writer.writerow(row)
Her is the code that you want. It will first parse the current page before going on to the next one. (There are some blank rows, I hope you can fix that yourself).
import csv
import requests
from bs4 import BeautifulSoup
def encode(l):
out = []
for i in l:
text = str(i).encode('utf-8')
out.append(''.join([i if ord(i) < 128 else ' ' for i in text])) #taken from Martjin Pieter's answer
# http://stackoverflow.com/questions/20078816/replace-non-ascii-characters-with-a-single-space/20078869#20078869
return out
courses_list = []
for i in range(5): # Number of pages plus one
url = "http://www.pga.com/golf-courses/search?page={}&searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0".format(i)
r = requests.get(url)
soup = BeautifulSoup(r.content)
g_data2=soup.find_all("div",{"class":"views-field-nothing"})
for item in g_data2:
try:
name = item.contents[1].find_all("div",{"class":"views-field-title"})[0].text
except:
name=''
try:
address1= item.contents[1].find_all("div",{"class":"views-field-address"})[0].text
except:
address1=''
try:
address2= item.contents[1].find_all("div",{"class":"views-field-city-state-zip"})[0].text
except:
address2=''
try:
website= item.contents[1].find_all("div",{"class":"views-field-website"})[0].text
except:
website=''
try:
Phonenumber= item.contents[1].find_all("div",{"class":"views-field-work-phone"})[0].text
except:
Phonenumber=''
course=[name,address1,address2,website,Phonenumber]
courses_list.append(encode(course))
with open ('PGA_Data.csv','a') as file:
writer=csv.writer(file)
for row in courses_list:
writer.writerow(row)
EDIT: After the inevitable problems of unicode encoding/decoding, I have modified the answer and it will (hopefully) work now. But http://nedbatchelder.com/text/unipain.html see this.

Categories