BeautifulSoup with Table - python

I'm web scraping with Beautiful Soup and I am getting an error on line 13: for row in table.findAll('tr').
The error shows up in the cmd window. I hope someone can help.
# Question code, reproduced as posted. Indentation was lost in the paste, so
# the loop nesting below cannot be recovered with certainty.
import csv
import requests
from bs4 import BeautifulSoup
url='http://www.dublincity.ie/dublintraffic/carparks.htm'
response = requests.get(url)
html= response.content
soup=BeautifulSoup(html)
# NOTE(review): the asker reports the error on the `table.findAll` loop below;
# presumably this lookup matched nothing and `table` is None -- confirm.
table=soup.find('tbody', attrs={'id' :'itemsBody'})
list_of_rows=[]
for row in table.findAll('tr'):
list_of_cells=[]
for cell in row.findAll('td'):
text = cell.text.replace(' ','')
list_of_cells.append(text)
# NOTE(review): this appends list_of_cells to itself; nothing in this snippet
# ever appends to list_of_rows, so it is still empty when written out below.
list_of_cells.append(list_of_cells)
# NOTE(review): "wb" is the Python 2 convention for csv files; under Python 3
# csv.writer writes str and expects a text-mode file -- confirm target version.
outfile= open("./carpark.csv", "wb")
writer=csv.writer(outfile)
# NOTE(review): writerows() takes a list of rows, so each of these two strings
# is treated as its own row; writerow() looks like what was intended.
writer.writerows(["location","spaces"])
writer.writerows(list_of_rows)

If you want to stick with BeautifulSoup, you can fetch the content using its XML parser and write it out with csv.DictWriter(). Check out the implementation:
# Download the Dublin car-park XML feed and save each car park's name and
# free-space count to xmldocs.csv.
import csv
import requests
from bs4 import BeautifulSoup

url = 'http://www.dublincity.ie/dublintraffic/cpdata.xml?1543254514266'

# A timeout keeps the script from hanging on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as data.
res = requests.get(url, timeout=30)
res.raise_for_status()

# BeautifulSoup's "xml" parser is backed by lxml, which must be installed.
soup = BeautifulSoup(res.content, "xml")

# The values are read from the `name` and `spaces` attributes of every
# <carpark> element; .get() yields None when an attribute is absent.
data = [
    {'Name': item.get("name"), 'Spaces': item.get("spaces")}
    for item in soup.select("carpark")
]

# newline="" is what the csv module requires to avoid blank lines on Windows;
# the explicit encoding makes the output independent of the platform default.
with open("xmldocs.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, ["Name", "Spaces"])
    writer.writeheader()
    writer.writerows(data)

You could retrieve the data as an XML document and then parse it. This is just an example of part of the process, which you could tailor to your needs.
# Fetch the car-park XML feed, flatten it into rows with ElementTree and save
# the result through a pandas DataFrame.
import requests
from xml.etree import ElementTree
import pandas as pd

url = 'http://www.dublincity.ie/dublintraffic/cpdata.xml?1543254514266'

# Fail fast on a stalled connection or an HTTP error instead of trying to
# parse whatever came back.
response = requests.get(url, timeout=30)
response.raise_for_status()
tree = ElementTree.fromstring(response.content)

# Walk two levels: every child of the root, then every element inside it.
# Each inner element must carry `name` and `spaces` attributes (KeyError if not).
parking = [
    [child.tag, nextChild.attrib['name'], nextChild.attrib['spaces']]
    for child in tree
    for nextChild in child
]

# Explicit column names so the CSV header is meaningful rather than 0,1,2.
df = pd.DataFrame(parking, columns=['tag', 'name', 'spaces'])
print(df)
df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8', index=False)

Related

Want to scrape each category individually, but the data comes out either one letter per cell or as a whole paragraph

I want to extract Name & Position, Education, Contact number and Email, each into a different column of a CSV file, but when I extract them I get either a single cell per letter or a single column per paragraph (if I wrap it in a list). Here is the code:
# Question code, reproduced as posted (indentation was lost in the paste).
import requests
from bs4 import BeautifulSoup
from csv import writer
url = 'https://governors.pwcs.edu/about_us/staff_bios_and_contact_information'
req = requests.get(url)
soup = BeautifulSoup(req.text, 'lxml')
# Every <p> element on the page is collected, not only those inside a
# specific container.
page = soup.find_all('p')
for i in page:
# `i` is rebound from the tag to its plain-text string here.
i = i.text
# Append mode: rows accumulate across runs because the file is never truncated.
with open('page.csv', 'a', encoding = 'utf8', newline='') as f:
thewriter = writer(f)
# writerow() iterates its argument to get the fields; `i` is a str, so each
# character becomes its own column -- the "single block per alphabet" symptom
# described in the question.
thewriter.writerow(i)
You can use regex to pull out what you need:
# Scrape the staff-bio page and split every bio paragraph into name,
# education, phone and email columns using one regular expression.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

url = 'https://governors.pwcs.edu/about_us/staff_bios_and_contact_information'

# Raw strings so \d, \s and \w reach the regex engine untouched (in a normal
# string they are invalid escape sequences). Compiled once, outside the loop.
# Groups: 1 = name/position, 3 = education, 5 = phone, 6 = email.
# The first email character class was `[a-zA-z1-9]`, which also matched
# punctuation between 'Z' and 'a' and excluded the digit 0.
# NOTE(review): the '#' between the email's local part and domain is kept from
# the original pattern; an '@' would normally be expected -- confirm against
# the page text.
BIO_PATTERN = re.compile(
    r'(^.*) (Education: )(.*)( Contact).*(\d{3}-\d{3}-\d{4})\s*'
    r'([a-zA-Z0-9].*#[\w].*\.[\w].*)'
)
COLUMNS = ['name', 'education', 'phone', 'email']

req = requests.get(url, timeout=30)
req.raise_for_status()
soup = BeautifulSoup(req.text, 'html.parser')

content = soup.find('div', {'id': 'divContent'})
if content is None:
    raise SystemExit("No <div id='divContent'> found; the page layout may have changed")

rows = []
for p in content.find_all('p'):
    match = BIO_PATTERN.search(p.text)
    if match is None:
        # Paragraphs that are not bios do not match; calling .groups() on the
        # None result would raise AttributeError, so skip them.
        continue
    name, _, edu, _, phone, email = match.groups()
    rows.append({
        'name': name,
        'education': edu,
        'phone': phone,
        'email': email,
    })

# Passing the columns keeps the header row even when nothing matched.
df = pd.DataFrame(rows, columns=COLUMNS)
df.to_csv('page.csv', index=False)

How to scrape the website properly and getting all td texts from website

I am new to Python. Does anyone know what the [1:] is for in {sum(int(td.text) for td in soup.select('td:last-child')[1:])}, or what [0] or [1] would do there? I have seen it in many scraping examples, below the for loop. While practicing I built the code below, and I am not able to scrape all the data into the CSV file. Thanks in advance, and sorry for asking two questions at once.
# Question code, reproduced as posted (indentation was lost in the paste, so
# the nesting of the loops and the `with` block is not certain).
import requests
from bs4 import BeautifulSoup
import csv
url= "https://iplt20.com/stats/2020/most-runs"
r= requests.get (url)
soup= BeautifulSoup (r.content, 'html5lib')
lst= []
table=soup.find ('div', attrs = {'class':'js-table'})
# The player-name loop below is commented out, so no dict in `lst` ever gets
# a 'Player' key even though the CSV header declares that column.
#for row in table.findAll ('div', attrs= {'class':'top-players__player-name'}):
# score = {}
# score['Player'] = row.a.text.strip()
# lst.append(score)
# NOTE(review): the class string ends with a trailing space; presumably it was
# copied verbatim from the page markup -- confirm it matches any element.
for row in table.findAll (class_='top-players__m top-players__padded '):
score = {}
# `row.td` is the first <td> found inside the matched element.
score['Matches'] = int(row.td.text)
lst.append(score)
filename= 'iplStat.csv'
with open (filename, 'w', newline='') as f:
w= csv.DictWriter(f,['Player', 'Matches'])
w.writeheader()
for score in lst:
w.writerow(score)
print (lst)
All of this is not even needed. Just use pandas:
# Let pandas parse the stats table straight out of the downloaded page and
# save it as CSV.
import io
import requests
import pandas as pd

url = "https://iplt20.com/stats/2020/most-runs"

# Fail fast on a stalled connection or an HTTP error page.
r = requests.get(url, timeout=30)
r.raise_for_status()

# read_html() returns one DataFrame per <table>; [0] takes the first one.
# The payload is wrapped in a file-like object because passing literal HTML
# to read_html() is deprecated in recent pandas releases.
df = pd.read_html(io.BytesIO(r.content))[0]
df.to_csv("iplStats.csv", index=False)
Screenshot of csv file:

Scrape table html multipage with beautifulsoup4 and urllib3

Help me, please.
The code I wrote only works for one page, but I want it to work for all pages. What should I do?
# Question code, reproduced as posted. Indentation was lost in the paste, so
# which statements sit inside the `for i` loop cannot be told from here; the
# asker reports that only one page ends up in the output.
import csv
import urllib3
from bs4 import BeautifulSoup
# The file is opened without a `with` block and is not closed in this snippet.
outfile = open("data.csv","w",newline='')
writer = csv.writer(outfile)
# range(1,20) yields page numbers 1 through 19.
for i in range(1,20) :
url = f'http://ciumi.com/cspos/barcode-ritel.php?page={i}'
req = urllib3.PoolManager()
res = req.request('GET', url)
tree = BeautifulSoup(res.data, 'html.parser')
# [0] takes the first <table> on the page.
table_tag = tree.select("table")[0]
# One list of cell texts (header and data cells) per <tr>; the comprehension
# continues on the next line.
tab_data = [[item.text for item in row_data.select("th,td")]
for row_data in table_tag.select("tr")]
for data in tab_data:
writer.writerow(data)
print( res, url, ' '.join(data))
Your code works well. If you want to scrape all the URLs and get the data from each of them, you just have to indent it correctly:
# Scrape the first table from pages 1-19 of the listing and append every row
# (header and data cells) to data.csv.
import csv
import urllib3
from bs4 import BeautifulSoup

# One PoolManager for the whole run so connections to the host are reused
# instead of a new pool being built for every page.
http = urllib3.PoolManager()

# `with` guarantees the file is flushed and closed even if a request fails.
with open("data.csv", "w", newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile)
    # Everything that must happen per page is indented under this loop.
    for i in range(1, 20):
        url = f'http://ciumi.com/cspos/barcode-ritel.php?page={i}'
        res = http.request('GET', url)
        tree = BeautifulSoup(res.data, 'html.parser')

        tables = tree.select("table")
        if not tables:
            # A page without a table would make [0] raise IndexError.
            print(res, url, 'no table found')
            continue

        tab_data = [
            [item.text for item in row_data.select("th,td")]
            for row_data in tables[0].select("tr")
        ]
        writer.writerows(tab_data)

        # Progress output: the last row written for this page, if any.
        if tab_data:
            print(res, url, ' '.join(tab_data[-1]))
But you have to clean the data to have a pretty csv file

How to webscrape wiki tables of multiple Companies

I am trying to web-scrape the Wikipedia tables of multiple companies such as Samsung, Alibaba, etc., but I am not able to do so. Below is my code:
# Question code, reproduced as posted (indentation was lost in the paste).
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
# The file is opened without a `with` block and is not closed in this snippet.
csvFile = open('Information.csv', 'wt+')
writer = csv.writer(csvFile)
lst=['Samsung','Facebook','Google','Tata_Consultancy_Services','Wipro','IBM','Alibaba_Group','Baidu','Yahoo!','Oracle_Corporation']
for a in lst:
# The URL is a plain string literal ending in "/a"; the loop variable `a` is
# never interpolated, so the same address is requested on every iteration.
html = urlopen("https://en.wikipedia.org/wiki/a")
bs = BeautifulSoup(html, 'html.parser')
table = bs.findAll('table')
# Despite its name, `tr` is each <table> element here; its rows are fetched
# on the next line.
for tr in table:
rows = tr.findAll('tr')
for row in rows:
csvRow = []
for cell in row.findAll(['td', 'th']):
csvRow.append(cell.get_text())
print(csvRow)
writer.writerow(csvRow)
You are passing the letter "a" as part of the string literal itself, not the variable a that refers to each item in the list. Here is the corrected code:
# For each company, download its Wikipedia article and write every row of
# every table on the page to Information.csv.
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup

lst = ['Samsung', 'Facebook', 'Google', 'Tata_Consultancy_Services', 'Wipro',
       'IBM', 'Alibaba_Group', 'Baidu', 'Yahoo!', 'Oracle_Corporation']

# `with` closes the file when done; newline='' is what the csv module requires
# to avoid blank lines on Windows, and utf-8 can represent any character that
# appears in the article text.
with open('Information.csv', 'w', newline='', encoding='utf-8') as csvFile:
    writer = csv.writer(csvFile)
    for a in lst:
        # Interpolate the loop variable into the URL (the original question
        # requested the literal page ".../wiki/a" every time).
        with urlopen("https://en.wikipedia.org/wiki/{}".format(a)) as html:
            bs = BeautifulSoup(html, 'html.parser')

        for table in bs.findAll('table'):
            for row in table.findAll('tr'):
                # Header (<th>) and data (<td>) cells, in document order.
                csvRow = [cell.get_text() for cell in row.findAll(['td', 'th'])]
                print(csvRow)
                writer.writerow(csvRow)
html = urlopen("https://en.wikipedia.org/wiki/a") is where the problem is.
You're looping through lst to get the URL for each company, but that fails because the urlopen call uses a fixed string literal instead of the loop variable.
The way to solve this is to replace html = urlopen("https://en.wikipedia.org/wiki/a") with any one of the following:
# Option 1: string concatenation.
html = urlopen("https://en.wikipedia.org/wiki/" + a)
# Option 2: f-string.
html = urlopen(f"https://en.wikipedia.org/wiki/{a}") # requires Python 3.6+
# Option 3: str.format().
html = urlopen("https://en.wikipedia.org/wiki/{}".format(a))

Web Scraping Real-Time

I am currently web scraping using BeautifulSoup: the data is fetched as XML and written to a CSV file, as shown in the code below. I am wondering what I could do to make it real-time, as the website updates every 5 minutes.
# One-shot version: fetch the car-park XML feed once and dump the name and
# spaces attributes of every <carpark> element to xmldocs.csv.
import csv
import requests
from bs4 import BeautifulSoup

url = 'http://www.dublincity.ie/dublintraffic/cpdata.xml?1543254514266'
res = requests.get(url)
soup = BeautifulSoup(res.content, "xml")

# One dict per <carpark>, keyed by the CSV column names.
data = [
    {'Name': carpark.get("name"), 'Spaces': carpark.get("spaces")}
    for carpark in soup.select("carpark")
]

with open("xmldocs.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, ["Name", "Spaces"])
    writer.writeheader()
    writer.writerows(data)
You can use a while loop and add a 5-minute sleep at the end of each iteration.
Using your example, this would be:
# Poll the car-park XML feed every five minutes and rewrite xmldocs.csv with
# the latest name/spaces values. Stop the script with Ctrl+C.
import csv
import requests
from bs4 import BeautifulSoup
import time

# Loop-invariant, so defined once outside the loop.
url = 'http://www.dublincity.ie/dublintraffic/cpdata.xml?1543254514266'

while True:
    try:
        # The timeout stops one stalled connection from freezing the poller.
        res = requests.get(url, timeout=30)
        res.raise_for_status()
    except requests.RequestException as exc:
        # A transient network or HTTP failure must not end a long-running
        # poller: keep the previous CSV and try again on the next cycle.
        print(f"Fetch failed, retrying on the next cycle: {exc}")
    else:
        soup = BeautifulSoup(res.content, "xml")
        data = [
            {'Name': item.get("name"), 'Spaces': item.get("spaces")}
            for item in soup.select("carpark")
        ]
        # "w" truncates, so the file always holds only the newest snapshot.
        with open("xmldocs.csv", "w", newline="") as f:
            writer = csv.DictWriter(f, ["Name", "Spaces"])
            writer.writeheader()
            writer.writerows(data)

    # Sleep after every attempt, successful or not.
    time.sleep(5 * 60)

Categories