CSV indentation and proper language in Python

I want to write the list that the print call below produces into a CSV file.
import re
import requests
from bs4 import BeautifulSoup

for i in range(146):
    r = requests.get("http://www.yellowpages.com/atlanta-ga/trends/{}".format(i))
    soup = BeautifulSoup(r.content, "html.parser")
    for link in soup.find_all("a", href=re.compile('/atlanta-ga/')):
        if 'trends' not in link.get('href'):
            link = link.get('href')
            results = "http://www.yellowpages.com{}?page=".format(link)
            import csv
            with open('Catagories', 'w') as myfile:
                wr = csv.writer(myfile)
                wr.writerow([results])
            print(results)
The purpose of this should be very apparent

@tdelaney is right: every time you open the file with "w", you're overwriting the previous contents.
The fix is to use "a" (append) instead:
with open('Catagories', 'a') as myfile:
    ...
Check out the docs: https://docs.python.org/3/library/functions.html#open
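For completeness, here is a sketch of the corrected script (same URLs and selector as above, assuming every link should be collected): open the file once before the loops and let a single csv.writer append each row:
import csv
import re

import requests
from bs4 import BeautifulSoup

# open once, write many rows; 'w' is fine because the file is never reopened
with open('Catagories', 'w', newline='') as myfile:
    wr = csv.writer(myfile)
    for i in range(146):
        r = requests.get("http://www.yellowpages.com/atlanta-ga/trends/{}".format(i))
        soup = BeautifulSoup(r.content, "html.parser")
        for link in soup.find_all("a", href=re.compile('/atlanta-ga/')):
            href = link.get('href')
            if 'trends' not in href:
                wr.writerow(["http://www.yellowpages.com{}?page=".format(href)])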

Related

Preserving source HTML entities with BeautifulSoup

Is there any way I can preserve HTML entities in the source when parsing it with BeautifulSoup?
from bs4 import BeautifulSoup
soup = BeautifulSoup('<p class="test">&quot;Hello World!&quot; I said')
print(soup.string)
# Outputs: '"Hello World!" I said'
# Wanted/Expected: '&quot;Hello World!&quot; I said'
Also, when writing those preserved HTML entities back to a file, will f.write(str(soup)) do? The following code is meant to produce an identical copy of the original, which it currently doesn't:
from bs4 import BeautifulSoup
from pathlib import Path
# The original contains tons of HTML entities
original = Path("original.html")
output = Path("duplicate.html")
with open(original, "rt", encoding="utf8") as f:
    soup = BeautifulSoup(f, "lxml")

with open(output, "wt", encoding="utf8") as f:
    f.write(str(soup))
You have to create a custom formatter:
from bs4 import BeautifulSoup
def formatQuot(string):
    return string.replace('"', '&quot;')

soup = BeautifulSoup('<p class="test">&quot;Hello World!&quot; I said ', 'html.parser')
print(soup.decode(formatter=formatQuot))
# <p class="test">&quot;Hello World!&quot; I said </p>

text = formatQuot(soup.text)
print(text)
# &quot;Hello World!&quot; I said
Thanks @uingtea for the custom formatter suggestion. As I also need to preserve the tag attribute order, I've subclassed HTMLFormatter as per the BeautifulSoup docs:
from pathlib import Path
from bs4 import BeautifulSoup
from bs4.formatter import HTMLFormatter
def my_formatter(string):
    string = string.replace('&', '&amp;')
    # string = string.replace('…', '&hellip;')
    string = string.replace('"', '&quot;').replace("'", '&#39;')
    string = string.replace('<', '&lt;').replace('>', '&gt;')
    return string

class customFormat(HTMLFormatter):
    def attributes(self, tag):
        # yield attributes in their original parsed order
        for k, v in tag.attrs.items():
            yield k, v

cform = customFormat(my_formatter)

original = Path("original.html")
output = Path("output.html")

with open(original, "rt", encoding="utf8") as f:
    soup = BeautifulSoup(f, "lxml")

with open(output, "wt", encoding="utf8", newline="\n") as f:
    f.write(soup.decode(formatter=cform))
Is there a cleaner way to write the custom formatter, i.e. without defining a free function and then passing it to the constructor of the subclassed formatter? The docs are pretty scant on how to write a custom/subclassed formatter.
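One possibility (an untested sketch, not from the original thread, assuming bs4's Formatter applies entity substitution through its substitute() method) is to fold the free function into the subclass:
from bs4.formatter import HTMLFormatter

class CustomFormat(HTMLFormatter):
    # entity-preserving formatter with no external helper function
    def substitute(self, string):
        # same replacements as my_formatter above
        string = string.replace('&', '&amp;')
        string = string.replace('"', '&quot;').replace("'", '&#39;')
        return string.replace('<', '&lt;').replace('>', '&gt;')

    def attributes(self, tag):
        # keep attribute order exactly as parsed
        for k, v in tag.attrs.items():
            yield k, v

# usage would stay the same:
# f.write(soup.decode(formatter=CustomFormat()))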

Python - why the print result is repeated and "write to a text file" only has one line

Lovely people! I'm totally new to Python. I tried to scrape several URLs and encountered a problem with print.
I tried to print and write out the "shipment status".
I have two URLs, so ideally I get two results.
This is my code:
from bs4 import BeautifulSoup
import re
import urllib.request
import urllib.error
import urllib
# read urls of websites from text file
list_open = open("c:/Users/***/Downloads/web list.txt")
read_list = list_open.read()
line_in_list = read_list.split("\n")
for url in line_in_list:
    soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'html')
    # parse something special in the file
    shipment = soup.find_all('span')
    Preparation = shipment[0]
    Sent = shipment[1]
    InTransit = shipment[2]
    Delivered = shipment[3]
    for p in shipment:
        # extract information
        print(url, ';', "Preparation", Preparation.getText(), ";", "Sent", Sent.getText(), ";", "InTransit", InTransit.getText(), ";", "Delivered", Delivered.getText())

import sys
file_path = 'randomfile.txt'
sys.stdout = open(file_path, "w")
print(url, ';', "Preparation", Preparation.getText(), ";", "Sent", Sent.getText(), ";", "InTransit", InTransit.getText(), ";", "Delivered", Delivered.getText())
I have two problems here:
Problem one: I have only two URLs, but when I print the results, every "span" is repeated 4 times (as there are four "span"s).
The result in the "output" is as below:
(I deleted the result example to protect privacy.)
Problem two: I tried to write the "print" to a text file, but only one line appeared in the file:
(I deleted the result example to protect privacy.)
I want to know what is wrong with the code; I want to print only the 2 URL results.
Your help is really appreciated!
Thank you in advance!
The first point is caused by iterating over shipment - just delete the for loop and correct the indentation of print():
for url in line_in_list:
    soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'html')
    # parse something special in the file
    shipment = soup.find_all('span')
    Preparation = shipment[0]
    Sent = shipment[1]
    InTransit = shipment[2]
    Delivered = shipment[3]
    print(url, ';', "Preparation", Preparation.getText(), ";", "Sent", Sent.getText(), ";", "InTransit", InTransit.getText(), ";", "Delivered", Delivered.getText())
The second issue is caused by calling the write outside the loop and not in append mode - you will end up with this as your loop:
#open file in append mode
with open('somefile.txt', 'a') as f:
    #start iterating your urls
    for url in line_in_list:
        soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'html')
        # parse something special in the file
        shipment = soup.find_all('span')
        Preparation = shipment[0]
        Sent = shipment[1]
        InTransit = shipment[2]
        Delivered = shipment[3]
        #create output text
        line = f'{url};Preparation{Preparation.getText()};Sent{Sent.getText()};InTransit{InTransit.getText()};Delivered{Delivered.getText()}'
        #print output text
        print(line)
        #append output text to file
        f.write(line + '\n')
And you can delete:
import sys
file_path = 'randomfile.txt'
sys.stdout = open(file_path, "w")
print(url, ';', "Preparation", Preparation.getText(), ";", "Sent", Sent.getText(), ";", "InTransit", InTransit.getText(), ";", "Delivered", Delivered.getText())
Example of slightly optimized code:
from bs4 import BeautifulSoup
import urllib.request
import urllib.error
import urllib

# read urls of websites from text file
list_open = open("c:/Users/***/Downloads/web list.txt")
read_list = list_open.read()
line_in_list = read_list.split("\n")

file_path = "randomfile.txt"
with open('somefile.txt', 'a', encoding='utf-8') as f:
    for url in line_in_list:
        soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'html')
        # parse something special in the file
        shipment = list(soup.select_one('#progress').stripped_strings)
        line = f"{url},{';'.join(':'.join(x) for x in zip(shipment[::2], shipment[1::2]))}"
        print(line)
        f.write(line + '\n')
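For readers puzzled by the shipment[::2] / shipment[1::2] slices: stripped_strings yields labels and values alternately, and the two slices re-pair them. A standalone illustration with made-up status values:
shipment = ['Preparation', 'Done', 'Sent', 'Done', 'InTransit', 'Pending', 'Delivered', 'No']
labels = shipment[::2]    # every second item starting at index 0
values = shipment[1::2]   # every second item starting at index 1
pairs = list(zip(labels, values))
# [('Preparation', 'Done'), ('Sent', 'Done'), ('InTransit', 'Pending'), ('Delivered', 'No')]
print(';'.join(':'.join(p) for p in pairs))
# Preparation:Done;Sent:Done;InTransit:Pending;Delivered:No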
list_open = open("c:/Users/***/Downloads/web list.txt")
read_list = list_open.read()
line_in_list = read_list.split("\n")
file_path = 'randomfile.txt'
sys.stdout = open(file_path, "w")
There are actually four spans; try this:
for url in line_in_list:
    soup = BeautifulSoup(urlopen(url).read(), 'html')
    # parse something special in the file
    shipments = soup.find_all("span")  # there are four spans actually
    sys.stdout.write('Url ' + url + '; Preparation' + shipments[0].getText() + '; Sent' + shipments[1].getText() + '; InTransit' + shipments[2].getText() + '; Delivered' + shipments[3].getText())
    # move to the next line
    sys.stdout.write("\n")
First question
You have two nested loops:
for url in line_in_list:
    for p in shipment:
        print(...)
The print is nested in the second loop. If you have 4 shipments per url, that will lead to 4 prints per url.
Since you don't use p from for p in shipment, you can completely get rid of the second loop and move the print one indentation level left, like this:
for url in line_in_list:
    soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'html')
    # parse something special in the file
    shipment = soup.find_all('span')
    Preparation = shipment[0]
    Sent = shipment[1]
    InTransit = shipment[2]
    Delivered = shipment[3]
    print(url, ';', "Preparation", Preparation.getText(), ";", "Sent", Sent.getText(), ";", "InTransit", InTransit.getText(), ";", "Delivered", Delivered.getText())
Second question
sys.stdout = open(file_path, "w")
print(url, ';', "Preparation", Preparation.getText(), ";", "Sent", Sent.getText(), ";", "InTransit", InTransit.getText(), ";", "Delivered", Delivered.getText())
Without the file keyword argument, print writes to sys.stdout, which is by default your terminal output. There's only one print after sys.stdout = ... so there will only be one line written to the file.
There's another way to print to a file:
with open('demo.txt', 'a') as f:
    print('Hello world', file=f)
The keyword with will ensure the file is closed even if an exception is raised.
Both combined
From what I understood, you want to print two lines to the file. Here's a solution:
from bs4 import BeautifulSoup
import urllib.request
import urllib.error
import urllib

# read urls of websites from text file
list_open = open("c:/Users/***/Downloads/web list.txt")
read_list = list_open.read()
line_in_list = read_list.split("\n")
file_path = "randomfile.txt"

for url in line_in_list:
    soup = BeautifulSoup(urllib.request.urlopen(url).read(), "html")
    # parse something special in the file
    shipment = soup.find_all("span")
    Preparation = shipment[0]
    Sent = shipment[1]
    InTransit = shipment[2]
    Delivered = shipment[3]
    with open(file_path, "a") as f:
        f.write(
            f"{url} ; Preparation {Preparation.getText()}; Sent {Sent.getText()}; InTransit {InTransit.getText()}; Delivered {Delivered.getText()}\n"
        )

How to scrape a website properly and get all td texts from the website

I am new to Python. Does anyone know what the use of [1:] (or [0], or [1]) is in sum(int(td.text) for td in soup.select('td:last-child')[1:])? I have seen it in many scraping examples with for-in loops. Also, as I was practicing I built the code below, and I am not able to scrape all the data into a CSV file. Thanks in advance, and sorry for two questions at one time.
import requests
from bs4 import BeautifulSoup
import csv

url = "https://iplt20.com/stats/2020/most-runs"
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html5lib')

lst = []
table = soup.find('div', attrs={'class': 'js-table'})
#for row in table.findAll('div', attrs={'class': 'top-players__player-name'}):
#    score = {}
#    score['Player'] = row.a.text.strip()
#    lst.append(score)
for row in table.findAll(class_='top-players__m top-players__padded '):
    score = {}
    score['Matches'] = int(row.td.text)
    lst.append(score)

filename = 'iplStat.csv'
with open(filename, 'w', newline='') as f:
    w = csv.DictWriter(f, ['Player', 'Matches'])
    w.writeheader()
    for score in lst:
        w.writerow(score)
print(lst)
All of this is not even needed. Just use pandas:
import requests
import pandas as pd

url = "https://iplt20.com/stats/2020/most-runs"
r = requests.get(url)
df = pd.read_html(r.content)[0]
df.to_csv("iplStats.csv", index=False)
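As for the [1:] part of the question: indexing with [0] or [1] picks a single element, while slicing with [1:] returns everything except the first element. A quick illustration, independent of any website:
cells = ['header', '10', '20', '30']
print(cells[0])   # 'header' -> the first element
print(cells[1])   # '10'     -> the second element
print(cells[1:])  # ['10', '20', '30'] -> all but the first
# so soup.select('td:last-child')[1:] skips the first matched cell
# (typically a header) before the values are summed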

Getting a list output to write correctly in rows in a csv file

I am trying to write this output to a csv file but it is simply not working. I have tried many write-to-CSV tutorials but none of them work. If you could please direct me to a tutorial explaining why this isn't working, I would like to learn the issue and solve it.
import bs4
from urllib.request import urlopen as ureq
from bs4 import BeautifulSoup as soup
import csv

myurl = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38'
uclient = ureq(myurl)
page_html = uclient.read()
uclient.close()
page_soup = soup(page_html, 'html.parser')
items = page_soup.find_all('div', {'class': 'item-container'})

#filename = 'zeus.csv'
#f = open(filename, 'w')
#header = 'Item Details\n'
#f.write(header)
#contain = items[0]
#container = items[0]
for container in items:
    details = container.a.img['title']
    with open('zeus.csv', 'w') as f:
        f.write(details + "\n")
    #print(details)
You can run
with open('zeus.csv', 'w') as f:
    for container in items:
        details = container.a.img['title']
        f.write("{}\n".format(details))
The problem in the original code is that with open('zeus.csv', 'w') as f: was inside the loop, so each iteration overwrote the previous ones.
You can try something like this for writing a list to a .csv file:
import csv

#open file
with open(..., 'w', newline='') as your_file:
    writer = csv.writer(your_file, quoting=csv.QUOTE_ALL)
    # write your list values
    writer.writerow(your_list)
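If each scraped title should instead land on its own row, csv.writer can also take one row per item; a short sketch with hypothetical titles standing in for the scraped details:
import csv

titles = ['GPU A', 'GPU B', 'GPU C']  # stand-ins for container.a.img['title'] values

with open('zeus.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Item Details'])      # header row
    writer.writerows([t] for t in titles)  # one single-column row per title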

Issues while writing special characters to csv file

I am writing the crawled output of a webpage to CSV files. However, a few special characters, such as the dash, are not being written correctly.
Original Text : Amazon Forecast – Now Generally Available
Result in csv : Amazon Forecast â€“ Now Generally Available
I tried the below code
from bs4 import BeautifulSoup
from datetime import date
import requests
import csv

source = requests.get('https://aws.amazon.com/blogs/aws/').text
soup = BeautifulSoup(source, 'lxml')

# csv_file = open('aitrendsresults.csv', 'w')
csv_file = open('aws_cloud_results.csv', 'w', encoding='utf8')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['title', 'img', 'src', 'summary'])

match = soup.find_all('div', class_='lb-row lb-snap')
for n in match:
    imgsrc = n.div.img.get('src')
    titlesrc = n.find('div', {'class': 'lb-col lb-mid-18 lb-tiny-24'})
    titletxt = titlesrc.h2.text
    anchortxt = titlesrc.a.get('href')
    sumtxt = titlesrc.section.p.text
    print(sumtxt)
    csv_writer.writerow([titletxt, imgsrc, anchortxt, sumtxt])
csv_file.close()
Can you please help me to get the text like the same in original text provided above.
Create a function to strip non-ASCII characters (e.g. the en dash) and pass the string as an argument to the function below:
def decode_ascii(string):
    return string.encode('ascii', 'ignore').decode('ascii')

input_text = 'Amazon Forecast – Now Generally Available'
output_text = decode_ascii(input_text)
print(output_text)
The output in the CSV should then be Amazon Forecast Now Generally Available; note that 'ignore' drops the non-ASCII dash entirely rather than converting it.
I've been working with BS as well and I think you've only made a minor mistake. In line 8, where you open the csv file, the encoding should be "UTF-8" instead of "utf8".
See if that helps.
Using the title as a test, the following works for me:
from bs4 import BeautifulSoup
import requests, csv

source = requests.get('https://aws.amazon.com/blogs/aws/').text
soup = BeautifulSoup(source, 'lxml')

with open("aws_cloud_results.csv", "w", encoding="utf-8-sig", newline='') as csv_file:
    w = csv.writer(csv_file, delimiter=";", quoting=csv.QUOTE_MINIMAL)
    w.writerow(['title'])
    match = soup.find_all('div', class_='lb-row lb-snap')
    for n in match:
        titlesrc = n.find('div', {'class': 'lb-col lb-mid-18 lb-tiny-24'})
        titletxt = titlesrc.h2.text
        w.writerow([titletxt])
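For reference, the â€“ seen in the broken CSV is classic mojibake: the en dash's UTF-8 bytes read back under cp1252. A small demonstration of how it arises:
s = '\u2013'                                  # the en dash '–'
garbled = s.encode('utf-8').decode('cp1252')  # written as UTF-8, read as cp1252
print(garbled)                                # â€“
# writing the CSV with encoding='utf-8-sig', as above, adds a BOM so that
# Excel and similar tools detect UTF-8 instead of falling back to cp1252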
