I've almost finished a web crawler that grabs a table. It currently outputs only the first row of the table. Can anyone help identify why it does not return all of the rows? Please ignore the while loop; it is a placeholder that will eventually iterate over a range of pages.
import urllib
from bs4 import BeautifulSoup
# Fetch one USPC-to-CPC concordance page and print its table rows as
# comma-separated records. Writing to a file is disabled for now.
#file_name = "/user/joe/uspc-cpc.txt
#file = open(file_name,"w")
i=125
# Placeholder loop: `i` is incremented inside the body, so it runs once (page 125).
while i==125:
url = "http://www.uspto.gov/web/patents/classification/cpc/html/us" + str(i) + "tocpc.html"
print url + '\n'
i += 1
data = urllib.urlopen(url).read()
print data
#get the table data from dump
#append to csv file
soup = BeautifulSoup(data)
# The target table is located by its width="80%" attribute.
table = soup.find("table", width='80%')
# The first two <tr> elements are skipped (presumably header rows -- TODO confirm).
for tr in table.findAll('tr')[2:]:
# NOTE(review): `row` is never assigned in this snippet; the loop variable is `tr`.
col = row.findAll('td')
# Non-ASCII characters are dropped by encode('ascii', 'ignore').
uspc = col[0].get_text().encode('ascii','ignore')
cpc1 = col[1].get_text().encode('ascii','ignore')
cpc2 = col[2].get_text().encode('ascii','ignore')
cpc3 = col[3].get_text().encode('ascii','ignore')
print uspc + ',' + cpc1 + ',' + cpc2 + ',' + cpc3 + '\n'
#file.write(record)
#file.close()
CODE I'm running:
import urllib
from bs4 import BeautifulSoup
#file_name = "/users/ripple/uspc-cpc.txt"
#file = open(file_name,"w")
i=125
while i==125:
url = "http://www.uspto.gov/web/patents/classification/cpc/html/us" + str(i) + "tocpc.html"
print 'Grabbing from: ' + url + '\n'
i += 1
#get the table data from the page
data = urllib.urlopen(url).read()
#send to beautiful soup
soup = BeautifulSoup(data)
table = soup.find("table", width='80%')
for tr in table.findAll('tr')[2:]:
col = tr.findAll('td')
uspc = col[0].get_text().encode('ascii','ignore').replace(" ","")
cpc1 = col[1].get_text().encode('ascii','ignore').replace(" ","")
cpc2 = col[2].get_text().encode('ascii','ignore').replace(" ","")
cpc3 = col[3].get_text().encode('ascii','ignore').replace(" ","").replace("more...", "")
record = uspc + ',' + cpc1 + ',' + cpc2 + ',' + cpc3 + '\n'
print record
#file.write(record)
#file.close()
You are using tr as a loop variable, but refer to row instead in the loop. If you had row defined before it'll probably produce confusing results.
# Use the loop variable `tr` consistently inside the loop body.
for tr in table.findAll('tr')[2:]:
    col = tr.findAll('td')
works for me:
125/1,B 28D 1/00,B 28D 1/221,E 01C 23/081,B 28D 1/005,B 28D 1/06more...
125/2,B 23Q 35/10,B 22C 9/18,B 23B 5/162,B 23D 63/18,B 24B 53/07more...
125/3,B 28D 1/18,B 28D 1/003,B 28D 1/048,B 28D 1/181,B 24B 7/22more...
etc.
Related
I am trying to save results to a .csv file, but I receive the following message and have no idea how to fix it:
f.write(linha_csv)
ValueError: I/O operation on closed file.
Code below:
import requests
from bs4 import BeautifulSoup
import csv
from csv import reader, writer
# Fetch a MercadoLivre search page and try to append product rows to a CSV.
# Several statements below are wrapped across lines exactly as posted.
url_base = "https://lista.mercadolivre.com.br/"
# NOTE(review): `produto_nome` is not defined anywhere in this snippet.
soup = BeautifulSoup(requests.get(url_base + produto_nome).content,
"html.parser")
# `produtos` is assigned here but not read again in this snippet.
produtos = soup.findAll('div', attrs =
{'class': 'andes-card andes-card--flat andes-card--default ui-
search-result ui-search-result--core andes-card--padding-default'}
)
# The CSV is opened in append mode; `f` is only usable inside this `with` block.
with open
(r'Lista_Precos_MercadoLivre.csv','a',encoding='utf8',newline='')
as f:
fieldnames = ['Produto','Link do Produto','Preco']
dw = csv.DictWriter(f,delimiter=';',fieldnames=fieldnames)
dw.writeheader()
# `i` is never incremented in this snippet.
i = 1
while True:
# Iterates over the top-level nodes of the parsed document itself.
for tag in soup:
# soup.find() returns the first matching <h2> in the whole document each time.
titulo = soup.find('h2', attrs={'class': 'ui-search-
item__title'})
print(i, tag.text)
print(i,'Título do Produto:', titulo.text)
# NOTE(review): `link` is not defined anywhere in this snippet.
print(i,'Link do Produto:', link['href'])
next_link = soup.select_one( "a.andes-pagination__link:-soup-
contains(Seguinte)"
)
if not next_link: break
# NOTE(review): `real` and `centavos` are not defined anywhere in this snippet.
linha_csv = titulo.text + ';' + link['href'] + ';' + "R$" +
real.text + "," + centavos.text + '\n'
# NOTE(review): the reported "I/O operation on closed file" suggests this line
# runs after the `with` block has ended -- the original indentation is not visible here.
f.write(linha_csv)
Because the indentation in your question is not set correctly, the error may be caused by that. Moving the writing part into your for-loop should fix the issue:
# Build and write one CSV line per result item while `f` is still open.
for tag in soup.select('li.ui-search-layout__item'):
    linha_csv = tag.h2.text + ';' + tag.a['href'] + ';' + tag.select_one('.price-tag-amount').text + '\n'
    f.write(linha_csv)
Example
import requests, csv
from bs4 import BeautifulSoup
# Crawl every result page of a MercadoLivre search and append one CSV row
# (product title, link, price) per listing.
url_base = "https://lista.mercadolivre.com.br/"
query = "vinho"
soup = BeautifulSoup(requests.get(url_base + query).content, "html.parser")

# newline='' lets the csv module control line endings itself.
# NOTE: the file is opened in append mode, so the header row is written again
# on every run of the script.
with open(r'Lista_Precos_MercadoLivre.csv', 'a', encoding='utf8', newline='') as f:
    fieldnames = ['Produto', 'Link do Produto', 'Preco']
    dw = csv.DictWriter(f, delimiter=';', fieldnames=fieldnames)
    dw.writeheader()

    while True:
        for tag in soup.select('li.ui-search-layout__item'):
            # Going through the DictWriter (instead of joining with ';' by hand)
            # quotes titles that contain ';' or quotes and keeps the row line
            # endings consistent with the header row.
            dw.writerow({
                'Produto': tag.h2.text,
                'Link do Produto': tag.a['href'],
                'Preco': tag.select_one('.price-tag-amount').text,
            })

        # Follow the "Seguinte" (next) pagination link until there is none.
        next_link = soup.select_one("a.andes-pagination__link:-soup-contains(Seguinte)")
        if not next_link:
            break
        soup = BeautifulSoup(requests.get(next_link["href"]).content, "html.parser")
Having an issue with bs4 when reading second value in array within a for loop. Below I will paste the code.
However, when I use line #19, I receive no errors. When I swap it out for the entire array (line #18), it errors out when it attempts to gather the second value. Note that the second value in the array is the same value as the one on line #19.
import requests
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
SmartLiving_IDS = "https://www.hayneedle.com/search/index.cfm?categoryID=&page=1&searchQuery=Smart%20Living&selectedFacets=Brand%7CSmart%20Living&sortBy="
IEL_IDS = "https://www.hayneedle.com/search/index.cfm?categoryID=&page=1&searchQuery=IEL&selectedFacets=Brand%7CIts%20Exciting%20Lighting&sortBy="
TD_IDS = "https://www.hayneedle.com/search/index.cfm?categoryID=&page=1&searchQuery=two%20dogs&selectedFacets=Brand%7CTwo%20Dogs%20Designs&sortBy="
Headers = "Description, URL, Price \n"
# Start a fresh CSV containing only the header line; rows are appended later.
text_file = open("HayneedlePrices.csv", "w")
text_file.write(Headers)
text_file.close()
URL_Array = [SmartLiving_IDS, IEL_IDS, TD_IDS]
#URL_Array = [IEL_IDS]
# Fetch and parse each search-results URL in turn.
for URL in URL_Array:
print("\n" + "Loading New URL:" "\n" + URL + "\n" + "\n")
uClient = uReq(URL)
page_html = uClient.read()
uClient.close()
# NOTE(review): `soup` is the import alias for BeautifulSoup; this assignment
# rebinds the same name to the parsed document, so the call on the next loop
# iteration no longer reaches the BeautifulSoup class.
soup = soup(page_html, "html.parser")
Containers = soup.findAll("div", {"product-card__container___1U2Sb"})
for Container in Containers:
Title = Container.div.img["alt"]
Product_URL = Container.a["href"]
Price_Container = Container.findAll("div", {"class":"product-card__productInfo___30YSc body no-underline txt-black"})[0].findAll("span", {"style":"font-size:20px"})
Price_Dollars = Price_Container[0].get_text()
Price_Cents = Price_Container[1].get_text()
print("\n" + "#####################################################################################################################################################################################################" + "\n")
# print(" Container: " + "\n" + str(Container))
# print("\n" + "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------" + "\n")
print(" Description: " + str(Title))
print(" Product URL: " + str(Product_URL))
print(" Price: " + str(Price_Dollars) + str(Price_Cents))
print("\n" + "#####################################################################################################################################################################################################" + "\n")
text_file = open("HayneedlePrices.csv", "a")
text_file.write(str(Title) + ", " + str(Product_URL) + ", " + str(Price_Dollars) + str(Price_Cents) + "\n")
text_file.close()
print("Information gathered and Saved from URL Successfully.")
print("Looking for Next URL..")
print("No Additional URLs to Gather. Process Completed.")
The problem is that you import BeautifulSoup as soup and also define a variable soup = soup(page_html, "html.parser") with the same name!
I refactored your code a bit, let me know if it works as expected!
import csv
import requests
from bs4 import BeautifulSoup
smart_living_IDS = "https://www.hayneedle.com/search/index.cfm?categoryID=&page=1&searchQuery=Smart%20Living&selectedFacets=Brand%7CSmart%20Living&sortBy="
IEL_IDS = "https://www.hayneedle.com/search/index.cfm?categoryID=&page=1&searchQuery=IEL&selectedFacets=Brand%7CIts%20Exciting%20Lighting&sortBy="
TD_IDS = "https://www.hayneedle.com/search/index.cfm?categoryID=&page=1&searchQuery=two%20dogs&selectedFacets=Brand%7CTwo%20Dogs%20Designs&sortBy="
site_URLs = [smart_living_IDS, IEL_IDS, TD_IDS]
# Collect (title, URL, price) for every product card on each search page,
# then write everything to a CSV in one go.
sess = requests.Session()  # one session so connections are reused across URLs
prod_data = []

for curr_URL in site_URLs:
    req = sess.get(url=curr_URL)
    soup = BeautifulSoup(req.content, "lxml")
    containers = soup.find_all("div", {"product-card__container___1U2Sb"})

    for curr_container in containers:
        prod_title = curr_container.div.img["alt"]
        prod_URL = curr_container.a["href"]

        price_container = curr_container.find(
            "div",
            {"class": "product-card__productInfo___30YSc body no-underline txt-black"},
        )
        dollars_elem = None
        if price_container is not None:
            dollars_elem = price_container.find("span", {"class": "main-price-dollars"})
        if dollars_elem is None:
            # Card without a price block: skip it instead of raising AttributeError.
            continue

        # The cents live in the <span> that follows the dollars <span>.
        cents_elem = dollars_elem.find_next("span")
        cents_text = cents_elem.get_text() if cents_elem is not None else ""
        prod_price = dollars_elem.get_text() + cents_text
        # Drop the leading character (presumably the currency sign -- TODO confirm)
        # and any thousands separators, which float() cannot parse.
        prod_price = float(prod_price[1:].replace(",", ""))

        prod_data.append((prod_title, prod_URL, prod_price))

CSV_headers = ("title", "URL", "price")

# newline="" lets the csv module control line endings itself.
with open("../out/hayneedle_prices.csv", "w", newline="") as file_out:
    writer = csv.writer(file_out)
    writer.writerow(CSV_headers)
    writer.writerows(prod_data)
I tested it by repeating the current URL list 10 times, it took longer than I was anticipating. There are certainly improvements to be made, I might rewrite it to use lxml in the next few days, and multiprocessing might also be a good option. It all depends on how you're using this, of course :)
I am new to BS4 and Python.
For a project I am trying to get some real estate data.
I wrote my code so that it gets two lists.
My challenge is to combine the data in the output.
Can anyone help me, please?
Thank you.
P.S.: Any tips on more efficient code are welcome.
from selenium import webdriver
from bs4 import BeautifulSoup
#open('output.csv', 'w').close()
import re
import time
import requests
from itertools import chain
from pandas import DataFrame
import csv
browser = webdriver.Chrome('C:/Users/907133/Pythonstuff/chromedriver')
browser.set_window_position(0,0)
# Crawl `max_pages` result pages of jaap.nl Amsterdam listings, writing the
# overview fields and then the detail-page fields to output.csv.
def jaap_spider(max_pages):
page = 1
while page <= max_pages:
url = 'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{}'.format(page)
browser.get(url)
# Fixed 5-second pause after loading the page.
time.sleep(5)
#input('Press Enter after bypassing Captcha')
#print(url)
soup = BeautifulSoup(browser.page_source, 'html.parser')
# `info` holds the listing cards; `inside` holds the links to the detail pages.
info = soup.find_all('div', {'class':'property-info'})
# NOTE(review): {'href'} is passed positionally as a third argument to find_all;
# it is not an attribute filter -- confirm against the bs4 find_all signature.
inside = soup.find_all('a', {'class': 'property-inner'},{'href'})
#print(inside)
for huis in info:
#locatie = huis.find('div')
#locatie = ' '.join(locatie.get_text(separator='\r\n', strip=True).split()[:-1])
#locatie = huis.find('h2')
#locatie = ' '.join(locatie.get_text(separator='\r\n', strip=True).split())
street = huis.find('h2')
street = ' '.join(street.get_text(separator='\r\n', strip=True).split()[:+3])
# [:+3] keeps the FIRST three whitespace-separated tokens of the heading, which are then re-joined with spaces
address = huis.find('div')
address = address.find('div').text.strip()
price = huis.find('div', {'class': 'price-info'})
price = price.find('div').text.strip()
# Keep only the digits of the price text.
price = re.findall(r'\d', price)
price = ''.join(price)
pricetag = huis.find('div', {'class': 'property-price'})
pricetag = pricetag.find('span').text.strip()
# `l1` is a single formatted string, not a list; it is not read again.
l1 = ('{},{},{},{}'.format(street, address, price, pricetag))
#print('{},{},{},{}'.format(street, address, price, pricetag))
# NOTE(review): opening with 'w' truncates output.csv each time this line runs,
# and neither `out` nor this `saveFile` handle is closed in this loop.
out = open('output.csv', 'w')
saveFile = open('output.csv', 'a')
saveFile.write(street + "," + address + "," + price + "," + pricetag + '\n')
#print (list1)
for items in inside:
href = items.get('href')
#print (href)
# str.format only has an effect if the href contains {} placeholders.
url1 = href.format(page)
browser.get(url1)
kenmerken = BeautifulSoup(browser.page_source, 'html.parser')
details = kenmerken.find_all ('div', {'class':'detail-tab-content kenmerken'})
try:
tr = details[0].find_all ('td', {'class': 'value'})
except IndexError:
# NOTE(review): `size_space` is not used afterwards, and `tr` is left unset
# (or keeps the previous listing's value) when this branch is taken.
size_space = 'Unknown'
#print (tr)
# NOTE(review): the body reads fixed indexes tr[0]..tr[4] rather than the loop
# item, and rebinds the loop variable `inhoud`; each pass yields the same values.
for inhoud in tr:
soort = tr[0].get_text(separator='\n', strip=True)
bouwjaar = tr[1].get_text(separator='\n', strip=True)
woonoppervlakte = tr[2].get_text(separator='\n', strip=True)
inhoud = tr[3].get_text(separator='\n', strip=True)
perceel = tr[4].get_text(separator='\n', strip=True)
# `l2` is a single formatted string, not a list; it is not read again.
l2 = ('{},{},{},{},{}'.format(soort, bouwjaar, woonoppervlakte, inhoud, perceel))
#print('{},{},{},{},{}'.format(soort, bouwjaar, woonoppervlakte, inhoud, perceel))
# Detail fields are appended as separate lines after the overview lines.
saveFile = open('output.csv', 'a')
saveFile.write(soort+ "," + bouwjaar+ "," + woonoppervlakte + "," + inhoud + "," + perceel + '\n')
saveFile.close()
#output = list(chain(list1,list2))
#print (output)
page += 1
#output = list(chain(list1,list2))
#print (output)
#kenmerken = inside.find_all ('a', {'class': 'href'})
#print (href)
#print (details)
#print('{},{},{},{}'.format(street, address, price, pricetag))
#saveFile = open('jaap.csv', 'a')
#saveFile.write(street + "," + address + "," + price + "," + pricetag + '\n')
#saveFile.close()
jaap_spider(1)
Right now your code doesn't actually seem to make two lists. But assuming that you build a list of lists `l1` from `for huis in info:` and a list of lists `l2` from `for items in inside:`, you can combine the two row by row with: `outputlist = [a + b for a, b in zip(l1, l2)]`.
I incorporated that, plus a conversion to a Pandas DataFrame and an export to csv in the adapted code below:
from selenium import webdriver
from bs4 import BeautifulSoup
import re
import time
import pandas as pd
#browser = webdriver.Chrome('C:/Users/907133/Pythonstuff/chromedriver')
browser = webdriver.Chrome(r'C:\Users\NLNIEH\.spyder-py3\chromedriver.exe')
browser.set_window_position(0,0)
def jaap_spider(max_pages):
    """Scrape jaap.nl Amsterdam listings and export them to ``output.csv``.

    For every result page from 1 to ``max_pages`` the overview fields (street,
    address, price, price tag) are read from the listing cards, then each
    listing's detail page is visited for its characteristics. The two halves
    are merged row by row and written once, after the last page.

    Args:
        max_pages: Number of result pages to crawl (pages are 1-based).
    """
    overview_header = ['street', 'address', 'price', 'pricetag']
    detail_header = ['soort', 'bouwjaar', 'woonoppervlakte', 'inhoud', 'perceel']

    # Accumulate across ALL pages; rebuilding these lists per page would leave
    # only the last page in the export.
    overview_rows = []
    detail_rows = []

    for page in range(1, max_pages + 1):
        url = 'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{}'.format(page)
        browser.get(url)
        time.sleep(5)  # fixed pause after loading the page, as before

        soup = BeautifulSoup(browser.page_source, 'html.parser')
        info = soup.find_all('div', {'class': 'property-info'})
        # The former third positional argument ({'href'}) landed in find_all's
        # `recursive` slot and was merely truthy, so dropping it changes nothing.
        inside = soup.find_all('a', {'class': 'property-inner'})

        for huis in info:
            # Keep the first three whitespace-separated tokens of the heading.
            heading = huis.find('h2').get_text(separator='\r\n', strip=True)
            street = ' '.join(heading.split()[:3])
            address = huis.find('div').find('div').text.strip()
            price_text = huis.find('div', {'class': 'price-info'}).find('div').text.strip()
            price = ''.join(re.findall(r'\d', price_text))  # digits only
            pricetag = huis.find('div', {'class': 'property-price'}).find('span').text.strip()
            overview_rows.append([street, address, price, pricetag])

        for items in inside:
            # Visit the listing's detail page.
            browser.get(items.get('href'))
            kenmerken = BeautifulSoup(browser.page_source, 'html.parser')
            details = kenmerken.find_all('div', {'class': 'detail-tab-content kenmerken'})
            values = details[0].find_all('td', {'class': 'value'}) if details else []

            # Exactly one row per listing: take the first five values and pad
            # with 'Unknown' so rows stay aligned with the overview rows even
            # when a detail page has no (or fewer) characteristics.
            row = [td.get_text(separator='\n', strip=True)
                   for td in values[:len(detail_header)]]
            row += ['Unknown'] * (len(detail_header) - len(row))
            detail_rows.append(row)

    # Merge overview and detail halves row by row (zip stops at the shorter list).
    merged = [a + b for a, b in zip(overview_rows, detail_rows)]

    # Transform to a pandas DataFrame and export as CSV.
    df = pd.DataFrame(merged, columns=overview_header + detail_header)
    df.to_csv('output.csv', index=False)
jaap_spider(1)
You can use the csv module to write a list as a row of a CSV file.
import csv
def write_list_in_file(filepath, output):
    """Append ``output`` as one comma-separated row to the file at ``filepath``.

    Args:
        filepath: Path of the CSV file; it is created if it does not exist.
        output: Iterable of field values making up a single row.
    """
    # newline='' is required by the csv module so that it controls line
    # endings itself; without it, Windows output gets blank lines between rows.
    with open(filepath, 'a', newline='') as outtsv:
        tuple_writer = csv.writer(outtsv, delimiter=',')
        tuple_writer.writerow(output)
I would like to know how to export my crawling results into multiple CSV files, one for each city that I have crawled. I keep running into walls and cannot find a proper way to sort it out.
That is my code:
import requests
from bs4 import BeautifulSoup
import csv
# Crawl TripAdvisor attraction listings for three regions and write every
# not-yet-seen point of interest to a single CSV file.
# `user_agent` is defined but not passed to requests.get below.
user_agent = {'User-agent': 'Chrome/43.0.2357.124'}
# One shared output file for all regions; it is never closed in this snippet.
output_file= open("TA.csv", "w", newline='')
RegionIDArray = [187147,187323,186338]
# Maps region ID -> city name. NOTE: this name shadows the builtin `dict`.
dict = {187147: 'Paris', 187323: 'Berlin', 186338: 'London'}
# Names already seen, shared across all regions.
already_printed = set()
for reg in RegionIDArray:
# The page offset runs 1, 31, 61, ... below 700.
for page in range(1,700,30):
r = requests.get("https://www.tripadvisor.de/Attractions-c47-g" + str(reg) + "-oa" + str(page) + ".html")
soup = BeautifulSoup(r.content)
g_data = soup.find_all("div", {"class": "element_wrap"})
for item in g_data:
header = item.find_all("div", {"class": "property_title"})
# `item` is rebound from the tag to its stripped title text.
item = (header[0].text.strip())
if item not in already_printed:
already_printed.add(item)
print("POI: " + str(item) + " | " + "Location: " + str(dict[reg]))
# A new csv.writer is created each time this line runs.
writer = csv.writer(output_file)
# `csv_fields` is assigned but not used.
csv_fields = ['POI', 'Locaton']
if g_data:
writer.writerow([str(item), str(dict[reg])])
My goal is to get three separate CSV files for Paris, Berlin and London instead of getting all the results in one big CSV file.
Could you guys help me out? Thanks for your feedback:)
I did some minor modifications to your code. To make files for each locale, I moved the out_file name inside the loop.
Note that, because I don't have time right now, the very last lines are a hack to ignore Unicode errors -- they simply skip any line containing a non-ASCII character. That isn't good. Maybe someone can fix that part?
import requests
from bs4 import BeautifulSoup
import csv
# Crawl TripAdvisor attraction listings and write one CSV file per region
# (TA<region id>.csv), each row being [point of interest, city name].
user_agent = {'User-agent': 'Chrome/43.0.2357.124'}  # NOTE: defined but not sent with the requests
RegionIDArray = {187147: 'Paris', 187323: 'Berlin', 186338: 'London'}
already_printed = set()  # POI names already written (shared across regions)

for reg, city in RegionIDArray.items():
    # One file per region. Writing as UTF-8 means non-ASCII names no longer
    # have to be skipped; newline="" lets the csv module control line endings;
    # `with` closes the file even if a request fails midway.
    with open("TA" + str(reg) + ".csv", "w", newline="", encoding="utf-8") as output_file:
        writer = csv.writer(output_file)  # one writer per file, not per row

        # The page offset runs 1, 31, 61, ... below 700.
        for page in range(1, 700, 30):
            r = requests.get("https://www.tripadvisor.de/Attractions-c47-g" + str(reg) + "-oa" + str(page) + ".html")
            # Name the parser explicitly so results do not depend on which
            # parsers happen to be installed.
            soup = BeautifulSoup(r.content, "html.parser")

            for element in soup.find_all("div", {"class": "element_wrap"}):
                header = element.find_all("div", {"class": "property_title"})
                if not header:
                    # Wrapper without a title block: nothing to record.
                    continue
                poi = header[0].text.strip()
                if poi not in already_printed:
                    already_printed.add(poi)
                    # print("POI: " + poi + " | " + "Location: " + city)
                    writer.writerow([poi, city])
I'm having a bit of trouble automatically scraping data in a table from a Wikipedia article. First I was getting an encoding error. I specified UTF-8 and the error went away, but the scraped data doesn't display a lot of the characters correctly. You will be able to tell from the code that I am a complete newbie:
from bs4 import BeautifulSoup
import urllib2
# Fetch a Wikipedia article and dump the rows of its first
# "wikitable sortable" table to output.csv (Python 2).
wiki = "http://en.wikipedia.org/wiki/Anderson_Silva"
header = {'User-Agent': 'Mozilla/5.0'} #Needed to prevent 403 error on Wikipedia
req = urllib2.Request(wiki,headers=header)
page = urllib2.urlopen(req)
soup = BeautifulSoup(page)
# Per-column placeholders; each is overwritten for every 10-cell row below.
Result = ""
Record = ""
Opponent = ""
Method = ""
Event = ""
Date = ""
Round = ""
Time = ""
Location = ""
Notes = ""
table = soup.find("table", { "class" : "wikitable sortable" })
f = open('output.csv', 'w')
for row in table.findAll("tr"):
cells = row.findAll("td")
#For each "tr", assign each "td" to a variable.
# Only rows with exactly 10 <td> cells are processed.
if len(cells) == 10:
# find(text=True) returns only the FIRST text node inside each cell.
Result = cells[0].find(text=True)
Record = cells[1].find(text=True)
Opponent = cells[2].find(text=True)
Method = cells[3].find(text=True)
Event = cells[4].find(text=True)
Date = cells[5].find(text=True)
Round = cells[6].find(text=True)
Time = cells[7].find(text=True)
Location = cells[8].find(text=True)
Notes = cells[9].find(text=True)
# Fields are joined by hand with commas (no quoting); `Notes` is extracted
# above but is not part of the written line.
write_to_file = Result + "," + Record + "," + Opponent + "," + Method + "," + Event + "," + Date + "," + Round + "," + Time + "," + Location + "\n"
# Despite the name, this is the UTF-8 encoded byte string.
write_to_unicode = write_to_file.encode('utf-8')
print write_to_unicode
f.write(write_to_unicode)
f.close()
As pswaminathan pointed out, using the csv module will help greatly. Here is how I do it:
import csv  # needed for csv.writer below

# Write every 10-cell row of the fight-record table to out2.csv (Python 2).
table = soup.find('table', {'class': 'wikitable sortable'})

# Python 2's csv module expects a binary-mode file ('wb'); in text mode,
# Windows output ends up with an extra carriage return on every row.
with open('out2.csv', 'wb') as f:
    csvwriter = csv.writer(f)
    for row in table.findAll('tr'):
        # Full text of each cell, encoded to UTF-8 bytes for the Python 2 writer.
        cells = [c.text.encode('utf-8') for c in row.findAll('td')]
        # Header rows contain <th> cells only, so they yield no <td> and are skipped.
        if len(cells) == 10:
            csvwriter.writerow(cells)
Discussion
Using the csv module, I created a csvwriter object connected to my output file.
By using the with command, I don't need to worry about closing the output file after done: it will be closed after the with block.
In my code, cells is a list of UTF8-encoded text extracted from the td tags within a tr tag.
I used the construct c.text, which is more concise than c.find(text=True).