Producing a Nonetype error - Python - python

When moving from the get_page_data function to for loop to extract business name, address etc. there's a problem which is giving a Nonetype error. I now know that means that a value of None is being passed into the for loop, but I'm not sure what's causing this error:
AttributeError: 'NoneType' object has no attribute 'text'
#!/opt/local/bin/python
import requests
import re
from bs4 import BeautifulSoup
import csv
#Read csv
with open ("gyms4.csv") as file:
reader = csv.reader(file)
csvfilelist = [row[0] for row in reader]
#Get data from each url
def get_page_data():
for page_data in csvfilelist:
r = requests.get(page_data.strip())
soup = BeautifulSoup(r.text, 'html.parser')
yield soup
#Complete work on data
for page in get_page_data():
name = page.find("span",{"class":"wlt_shortcode_TITLE"}).text
address = page.find("span",{"class":"wlt_shortcode_map_location"}).text
phoneNum = page.find("span",{"class":"wlt_shortcode_phoneNum"}).text
email = page.find("span",{"class":"wlt_shortcode_EMAIL"}).text
th = pages.find('b',text="Category")
td = th.findNext()
for link in td.findAll('a',href=True):
match = re.search(r'http://(\w+).(\w+).(\w+)', link.text)
if match:
web_address = link.text
gyms = [name,address,phoneNum,email,web_address]
gyms.append(gyms)
#Saving specific listing data to csv
with open ("xgyms.csv", "w") as file:
writer = csv.writer(file)
for row in gyms:
writer.writerow([row])

Related

How to STOP getting an empty CSV file with Scraping

When i run the code and i get my CSV file, its actually empty.
'''
import requests
from bs4 import BeautifulSoup
from csv import writer
url = 'https://www.fotocasa.es/es/alquiler/todas-las-casas/girona-provincia/todas-las-zonas/l'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
lists = soup.find_all('section', class_='re-CardPackAdvance')
with open('casas.csv', 'w', encoding='utf8', newline='') as f:
thewriter = writer(f)
header = ['Titulo', 'Precio', 'Metros', 'Telefono']
thewriter.writerow(header)
for list in lists:
titulo = list.find('a', class_='re-CardPackAdvance-info-container').text.replace('\n', '')
precio = list.find('span', class_='re-CardPrice').text.replace('\n', '')
metros = list.find('span', class_='re-CardFeaturesWithIcons-feature-icon--surface').text.replace('\n', '')
telefono = list.find('a', class_='re-CardContact-phone').text.replace('\n', '')
info = [titulo, precio, metros, telefono]
thewriter.writerow(info)
'''
I expected to have all the info scrapped from this website, but seems like i did something wrong at some point
You are parsing the resulting soup not appropriately. There is no section with the re-CardPackAdvance class. I adapted the code accordingly (find all articles with class that starts with re-CardPack). Please also note that you need to shift the for-loop by one indention. However, due to the structure of the page, only the first two entries are loaded directly when fetching the page. All other entries are fetched after the page has loaded in the browser (via javascript). I think you might consider using the API of the page instead.
import requests
from bs4 import BeautifulSoup
from csv import writer
import re
url = 'https://www.fotocasa.es/es/alquiler/todas-las-casas/girona-provincia/todas-las-zonas/l'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
lists = soup.find_all("article", class_=re.compile("^re-CardPack"))
print(len(lists))
with open('casas.csv', 'w', encoding='utf8', newline='') as f:
thewriter = writer(f)
header = ['Titulo', 'Precio', 'Metros', 'Telefono']
thewriter.writerow(header)
for list in lists:
titulo = list.find('a').get('title')
precio = list.find('span', class_='re-CardPrice').text.replace('\n', '')
metros = list.find('span', class_='re-CardFeaturesWithIcons-feature-icon--surface').text.replace('\n', '')
telefono = list.find('a', class_='re-CardContact-phone').text.replace('\n', '')
info = [titulo, precio, metros, telefono]
thewriter.writerow(info)

Issue using BeautifulSoup and reading target URLs from a CSV

Everything works as expected when I'm using a single URL for the URL variable to scrape, but not getting any results when attempting to read links from a csv. Any help is appreciated.
Info about the CSV:
One column with a header called "Links"
300 rows of links with no space, commoa, ; or other charters before/after the links
One link in each row
import requests # required to make request
from bs4 import BeautifulSoup # required to parse html
import pandas as pd
import csv
with open("urls.csv") as infile:
reader = csv.DictReader(infile)
for link in reader:
res = requests.get(link['Links'])
#print(res.url)
url = res
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
email_elm0 = soup.find_all(class_= "app-support-list__item")[0].text.strip()
email_elm1 = soup.find_all(class_= "app-support-list__item")[1].text.strip()
email_elm2 = soup.find_all(class_= "app-support-list__item")[2].text.strip()
email_elm3 = soup.find_all(class_= "app-support-list__item")[3].text.strip()
final_email_elm = (email_elm0,email_elm1,email_elm2,email_elm3)
print(final_email_elm)
df = pd.DataFrame(final_email_elm)
#getting an output in csv format for the dataframe we created
#df.to_csv('draft_part2_scrape.csv')
The problem lies in this part of the code:
with open("urls.csv") as infile:
reader = csv.DictReader(infile)
for link in reader:
res = requests.get(link['Links'])
...
After the loop is executed, res will have the last link. So, this program will only scrape the last link.
To solve this problem, store all the links in a list and iterate that list to scrape each of the link. You can store the scraped result in a seperate dataframe and concatenate them at the end to store in a single file:
import requests # required to make request
from bs4 import BeautifulSoup # required to parse html
import pandas as pd
import csv
links = []
with open("urls.csv") as infile:
reader = csv.DictReader(infile)
for link in reader:
links.append(link['Links'])
dfs = []
for url in links:
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
email_elm0 = soup.find_all(class_="app-support-list__item")[0].text.strip()
email_elm1 = soup.find_all(class_="app-support-list__item")[1].text.strip()
email_elm2 = soup.find_all(class_="app-support-list__item")[2].text.strip()
email_elm3 = soup.find_all(class_="app-support-list__item")[3].text.strip()
final_email_elm = (email_elm0, email_elm1, email_elm2, email_elm3)
print(final_email_elm)
dfs.append(pd.DataFrame(final_email_elm))
#getting an output in csv format for the dataframe we created
df = pd.concat(dfs)
df.to_csv('draft_part2_scrape.csv')

How to webscrape wiki tables of multiple Companies

I am trying to webscrape wiki tables of multiple companies like samsung,alibaba etc,but can't able to so. Below is My code
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
csvFile = open('Information.csv', 'wt+')
writer = csv.writer(csvFile)
lst=['Samsung','Facebook','Google','Tata_Consultancy_Services','Wipro','IBM','Alibaba_Group','Baidu','Yahoo!','Oracle_Corporation']
for a in lst:
html = urlopen("https://en.wikipedia.org/wiki/a")
bs = BeautifulSoup(html, 'html.parser')
table = bs.findAll('table')
for tr in table:
rows = tr.findAll('tr')
for row in rows:
csvRow = []
for cell in row.findAll(['td', 'th']):
csvRow.append(cell.get_text())
print(csvRow)
writer.writerow(csvRow)
You are passing a as a string itself, not a reference to one of the items in the list. Here is the corrected code:
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
csvFile = open('Information.csv', 'wt+')
writer = csv.writer(csvFile)
lst=['Samsung','Facebook','Google','Tata_Consultancy_Services','Wipro','IBM','Alibaba_Group','Baidu','Yahoo!','Oracle_Corporation']
for a in lst:
html = urlopen("https://en.wikipedia.org/wiki/{}".format(a))
bs = BeautifulSoup(html, 'html.parser')
table = bs.findAll('table')
for tr in table:
rows = tr.findAll('tr')
for row in rows:
csvRow = []
for cell in row.findAll(['td', 'th']):
csvRow.append(cell.get_text())
print(csvRow)
writer.writerow(csvRow)
html = urlopen("https://en.wikipedia.org/wiki/a") is where the problem is.
you're looping through lst to get the url for each company but failed to do so by using a string literal in the urlopen method.
the way to solve this is to replace html = urlopen("https://en.wikipedia.org/wiki/a") with either one of the following:
html = urlopen("https://en.wikipedia.org/wiki/" + a)
html = urlopen(f"https://en.wikipedia.org/wiki/{a}") #requires python 3.6+
html = urlopen("https://en.wikipedia.org/wiki/{}".format(a))

Skipp the error while scraping a list of urls form a csv

I managed to scrape a list of urls from a CSV file, but I got a problem, the scraping stops when it hits a broken link. Also it prints a lot of None lines, is it possible to get rid of them ?
Would appreciate some help here. Thank you in advance !
Here is the code :
#!/usr/bin/python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup #required to parse html
import requests #required to make request
#read file
with open('urls.csv','r') as f:
csv_raw_cont=f.read()
#split by line
split_csv=csv_raw_cont.split('\n')
#specify separator
separator=";"
#iterate over each line
for each in split_csv:
#specify the row index
url_row_index=0 #in our csv example file the url is the first row so we set 0
#get the url
url = each.split(separator)[url_row_index]
#fetch content from server
html = requests.get(url).content
#soup fetched content
soup = BeautifulSoup(html,'lxml')
tags = soup.find("div", {"class": "productsPicture"}).findAll("a")
for tag in tags:
print(tag.get('href'))
And the result with the error looks like this :
https://www.tennis-point.com/asics-gel-resolution-7-all-court-shoe-men-white-silver-02013802720000.html
None
https://www.tennis-point.com/cep-ultralight-run-sports-socks-men-black-light-green-12143000063000.html
None
https://www.tennis-point.com/asics-gel-solution-speed-3-clay-court-shoe-men-white-grey-02013802634000.html
None
https://www.tennis-point.com/asics-gel-solution-speed-3-all-court-shoe-men-white-silver-02013802723000.html
None
https://www.tennis-point.com/asics-gel-challenger-9-indoor-carpet-shoe-men-white-grey-02012401735000.html
None
https://www.tennis-point.com/asics-gel-court-speed-clay-court-shoe-men-dark-blue-yellow-02014202833000.html
None
https://www.tennis-point.com/asics-gel-court-speed-all-court-shoe-men-white-silver-02014202832000.html
None
Traceback (most recent call last):
File "/Users/imaging-adrian/Desktop/Python Scripts/close_to_work.py", line 33, in <module>
tags = soup.find("div", {"class": "productsPicture"}).findAll("a")
AttributeError: 'NoneType' object has no attribute 'findAll'
[Finished in 3.7s with exit code 1]
[shell_cmd: python -u "/Users/imaging-adrian/Desktop/Python
Scripts/close_to_work.py"]
[dir: /Users/imaging-adrian/Desktop/Python Scripts]
[path: /Users/imaging-adrian/anaconda3/bin:/Library/Frameworks/Python.framework/Versions/3.6/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:/usr/local/munki]
The links inside my CSV files look like this :
https://www.tennis-point.com/index.php?stoken=737F2976&lang=1&cl=search&searchparam=E701Y-0193;
https://www.tennis-point.com/index.php?stoken=737F2976&lang=1&cl=search&searchparam=E601N-4907;
https://www.tennis-point.com/index.php?stoken=737F2976&lang=1&cl=search&searchparam=E601N-0193;
https://www.tennis-point.com/index.php?stoken=737F2976&lang=1&cl=search&searchparam=E600N-0193;
https://www.tennis-point.com/index.php?stoken=737F2976&lang=1&cl=search&searchparam=E326Y-0174;
https://www.tennis-point.com/index.php?stoken=737F2976&lang=1&cl=search&searchparam=E801N-4589;
https://www.tennis-point.com/index.php?stoken=737F2976&lang=1&cl=search&searchparam=E800N-0193;
https://www.tennis-point.com/index.php?stoken=737F2976&lang=1&cl=search&searchparam=E800N-9093;
https://www.tennis-point.com/index.php?stoken=737F2976&lang=1&cl=search&searchparam=E800N-4589;
https://www.tennis-point.com/index.php?stoken=737F2976&lang=1&cl=search&searchparam=E804N-9095;
Here is working version,
from bs4 import BeautifulSoup
import requests
import csv
with open('urls.csv', 'r') as csvFile, open('results.csv', 'w', newline='') as results:
reader = csv.reader(csvFile, delimiter=';')
writer = csv.writer(results)
for row in reader:
# get the url
url = row[0]
# fetch content from server
html = requests.get(url).content
# soup fetched content
soup = BeautifulSoup(html, 'html.parser')
divTag = soup.find("div", {"class": "productsPicture"})
if divTag:
tags = divTag.findAll("a")
else:
continue
for tag in tags:
res = tag.get('href')
if res != None:
writer.writerow([res])

BeautifulSoup / Python - Convert HTML table to CSV and get href for one column

I am grabbing an HTML table with this code :
import csv
import urllib2
from bs4 import BeautifulSoup
with open('listing.csv', 'wb') as f:
writer = csv.writer(f)
for i in range(39):
url = "file:///C:/projects/HTML/Export.htm".format(i)
u = urllib2.urlopen(url)
try:
html = u.read()
finally:
u.close()
soup=BeautifulSoup(html)
for tr in soup.find_all('tr')[2:]:
tds = tr.find_all('td')
row = [elem.text.encode('utf-8') for elem in tds]
writer.writerow(row)
Everything works perfectly, but I am trying to grab column 9 Href URL. It is currently giving me the txt value but not the URL.
Also, I have two tables in my HTML, anyway to skip the first table and just build the csv file using the second table?
Any help is very welcomed as I am new to Python and need this for a project I am automating a daily conversion.
Many thanks!
You should access the href attribute of the a tag within the 8th td tag:
import csv
import urllib2
from bs4 import BeautifulSoup
records = []
for index in range(39):
url = get_url(index) # where is the formatting in your example happening?
response = urllib2.urlopen(url)
try:
html = response.read()
except Exception:
raise
else:
my_parse(html)
finally:
try:
response.close()
except (UnboundLocalError, NameError):
raise UnboundLocalError
def my_parse(html):
soup = BeautifulSoup(html)
table2 = soup.find_all('table')[1]
for tr in table2.find_all('tr')[2:]:
tds = tr.find_all('td')
url = tds[8].a.get('href')
records.append([elem.text.encode('utf-8') for elem in tds])
# perhaps you want to update one of the elements of this last
# record with the found url now?
# It's more efficient to write only once
with open('listing.csv', 'wb') as f:
writer = csv.writer(f)
writer.writerows(records)
I have taken the liberty to define a function get_url based on the index because your example rereads the same file every time, which is something I guess you don't actually want. I'll leave the implementation to you. Also, I've added some better exception handling.
At the same time, I've shown how you can access the 2nd table from that webpage's tables.
Was fully able to get it working with the following code:
import csv
import urllib2
from bs4 import BeautifulSoup
#Grab second table from HTML
def my_parse(html):
soup = BeautifulSoup(html)
table2 = soup.find_all('table')[1]
for tr in table2.find_all('tr')[2:]:
tds = tr.find_all('td')
url = tds[8].a.get('href')
tds[8].a.replaceWith(url)
records.append([elem.text.encode('utf-8') for elem in tds])
records = []
#Read HTML file into memory
for index in range(39):
url = "file:///C:/projects/HTML/Export.htm".format(index)
response = urllib2.urlopen(url)
try:
html = response.read()
except Exception:
raise
else:
my_parse(html)
finally:
try:
response.close()
except (UnboundLocalError, NameError):
raise UnboundLocalError
#Writing CSV file
with open('listing.csv', 'wb') as f:
writer = csv.writer(f)
writer.writerows(records)
Many thanks for all the help!!!!!

Categories