I have a Python script that scrapes data from a website. The code works fine, but I want to change the URL source to a text file on my desktop. The URLs in my text file are one per line.
How do you suggest I read this file and loop through the URLs?
Thanks in advance for your time.
import csv
import requests
from bs4 import BeautifulSoup

csv_file = open('cms_scrape.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['name', 'link', 'price'])

for x in range(0, 70):
    try:
        urls = 'https://www.meisamatr.com/fa/product/cat/2-%D8%A2%D8%B1%D8%A7%DB%8C%D8%B4%DB%8C.html&pagesize[]=24&order[]=new&stock[]=1&page[]=' + str(x + 1) + '&ajax=ok?_=1561559181560'
        source = requests.get(urls).text
        soup = BeautifulSoup(source, 'lxml')
        print('Page: %s' % (x + 1))

        for figcaption in soup.find_all('figcaption'):
            price = figcaption.find('span', {'class': 'new_price'}).text.strip()
            name = figcaption.find('a', class_='title').text
            link = figcaption.find('a', class_='title')['href']

            print('%s\n%s\n%s' % (price, name, link))
            csv_writer.writerow([name, link, price])
    except:
        break

csv_file.close()
If you don't have too many URLs in that text file (urls.txt in my example), the following snippet should do what you want.
import requests

# read all URLs at once
with open("urls.txt", "r") as f:
    urls = f.read().splitlines()

# and loop over them
for url in urls:
    try:
        source = requests.get(url).text
    except Exception as e:
        print(e)
        break
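If the file is large, a lazier variant (same urls.txt assumed) is to iterate over it line by line instead of reading everything into memory:

import requests

# iterate over the file lazily, one URL per line
with open("urls.txt", "r") as f:
    for line in f:
        url = line.strip()
        if not url:  # skip blank lines
            continue
        try:
            source = requests.get(url).text
        except requests.exceptions.RequestException as e:
            print(e)
            continue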
Let's suppose you have a file called input.txt which looks like this:
url1
url2
url3
url4
.
.
.
Then we simply open this input.txt file and split it on newlines ('\n'). This gives us a list of URLs, like:
['url1', 'url2', 'url3']
You can then loop through it and crawl the web pages. Here is a complete example:
# crawler.py
import csv
import requests
from bs4 import BeautifulSoup

with open('input.txt', 'r') as f:
    urls = f.read().split()  # here we get a list of urls

csv_file = open('cms_scrape.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['name', 'link', 'price'])

for url in urls:
    try:
        source = requests.get(url).text
        soup = BeautifulSoup(source, 'lxml')

        for figcaption in soup.find_all('figcaption'):
            price = figcaption.find('span', {'class': 'new_price'}).text.strip()
            name = figcaption.find('a', class_='title').text
            link = figcaption.find('a', class_='title')['href']

            print('%s\n%s\n%s' % (price, name, link))
            csv_writer.writerow([name, link, price])
    except Exception as e:
        print(e)
        break

csv_file.close()
I'm trying to write a list of links to a CSV file with a "Search Term" column and a "URL" column.
I'm sure I'm not using the csv library correctly, since it prints each character of just the first link to the file. Here's my code:
for t in terms:
    fields = ["Search Term", "URL"]
    url = f"https://news.google.com/rss/search?q={t}&hl=en-US&gl=US&ceid=US%3Aen"
    html_page = requests.get(url)
    soup = BeautifulSoup(html_page.text, "lxml")
    for item in soup.find_all("item"):
        link = str(item)
        i = link.find("<link/>")
        j = link.find("<guid")
        links = link[i+7:j]
        with open("urls.csv", "w") as f:
            write = csv.writer(f)
            write.writerow(fields)
            write.writerows(links)
Any help would be so appreciated. Thanks!!
Use the xml parser when creating the soup:
import csv
import requests
from bs4 import BeautifulSoup

terms = ["refrigerator", "kitchen sink"]

with open("urls.csv", "w") as f_out:
    writer = csv.writer(f_out)
    writer.writerow(["Search Term", "URL"])

    for t in terms:
        url = f"https://news.google.com/rss/search?q={t}&hl=en-US&gl=US&ceid=US%3Aen"
        print(f"Getting {url}")

        html_page = requests.get(url)
        soup = BeautifulSoup(html_page.content, "xml")

        for item in soup.find_all("link"):
            writer.writerow([t, item.get_text(strip=True)])
This creates urls.csv with one row per search term and link (screenshot from LibreOffice omitted).
I'm quite new to web scraping with BeautifulSoup, and I'm attempting to loop through multiple pages and write everything into a CSV file.
Here is my current code:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen

urls = []
url = "https://www.newegg.com/Black-Friday-Deals/EventSaleStore/ID-10475/"

for page in range(1, 7):
    pageurl = 'Page-{1}'.format(url, page)
    urls.append(pageurl)

    page = urlopen(url)
    html = page.read().decode("utf-8")
    page.close()

    page_soup = soup(html, "html.parser")
    containers = page_soup.findAll("div", {"class": "item-container"})
    print(len(containers))

    filename = "BlackFridayNewegg.csv"
    f = open(filename, "w")
    headers = "Product, Previous price, Current price"
    f.write(headers)

    for container in containers:
        rating_container = page_soup.findAll("span", {"class": "item-rating-num"})
        rating = rating_container[0].text.strip()

        title_container = container.findAll("a", {"class": "item-title"})
        title = title_container[0].text.strip()

        prev_price_container = container.findAll("li", {"class": "price-was"})
        prev_price = prev_price_container[0].text

        current_price_container = container.findAll("li", {"class": "price-current"})
        current_price = current_price_container[0].text

        print("Product Name: " + title)
        print("Previous Price: " + prev_price)
        print("Current Price: " + current_price)

        result = title.replace(",", "") + "," + prev_price + "," + current_price + "\n"
        f.write(result)
    f.close()
My code runs and displays text from multiple pages, but it won't write it all into the file. Any idea why this is happening?
f = open(filename, "w") clears the file every time it is opened. You need f = open(filename, "a") so that each pass appends instead of overwriting.
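A tiny, self-contained illustration of the difference (the file name demo.txt is just for this example):

# "w" truncates the file every time it is opened, so earlier writes are lost;
# "a" keeps the existing contents and appends to the end
with open("demo.txt", "w") as f:
    f.write("first line\n")

with open("demo.txt", "a") as f:  # append mode preserves "first line"
    f.write("second line\n")

with open("demo.txt") as f:
    print(f.read())  # prints both lines

Alternatively, opening the file once before the page loop (and closing it after) avoids the problem entirely.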
I need to append specific text to all output lines and ultimately build a URL from each one. More explanation after the code.
from bs4 import BeautifulSoup
import requests

source = requests.get('https://dota2.gamepedia.com/Category:Counters').text
soup = BeautifulSoup(source, 'lxml')

link = soup.find('div', class_="mw-category")

heroes_names = []
savefile = open('file.txt', 'w')

for link in link:
    link = link.text
    # print(link)
    heroes = link.split("\n")
    # print(heroes)

    for i in range(1, len(heroes)):
        # print(heroes)
        heroname = heroes[i].split("/")[0]
        print(heroname)
        heroes_names.append(heroname)
        savefile.write(heroname + "\n")

# for hero_name in heroes_names:
#     print(hero_name)

savefile.close()
required output:
Abaddon/counters
Alchemist/counters
and so on
final requirement:
https://dota2.gamepedia.com/Abaddon/Counters
https://dota2.gamepedia.com/Alchemist/Counters
and so on
So you already have all your hero names in heroes_names as strings, right? Then you can create the URLs like this:
url_list = []
for hero_name in heroes_names:
    print(hero_name + "/counters")  # Prints out HERO/counters
    url = "https://dota2.gamepedia.com/%s/Counters" % hero_name
    url_list.append(url)
url_list then contains all urls for heroes in your heroes_names list.
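As a follow-up sketch, the generated URLs could then be fetched in the same way as the category page; this assumes url_list from the snippet above and simply checks that each counters page responds:

import requests

for url in url_list:
    try:
        response = requests.get(url)
        response.raise_for_status()  # raise if the server returned an error status
        print(url, "->", response.status_code)
    except requests.exceptions.RequestException as e:
        print(url, "failed:", e)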
I need your help with this question:
I have a working Python script here:
from bs4 import BeautifulSoup
import requests
import csv

with open('urls.csv', 'r') as csvFile, open('results.csv', 'w', newline='') as results:
    reader = csv.reader(csvFile, delimiter=';')
    writer = csv.writer(results)

    for row in reader:
        # get the url
        url = row[0]

        # fetch content from server
        html = requests.get(url).content

        # soup fetched content
        soup = BeautifulSoup(html, 'html.parser')

        divTag = soup.find("div", {"class": "productsPicture"})

        if divTag:
            tags = divTag.findAll("a")
        else:
            continue

        for tag in tags:
            res = tag.get('href')
            if res != None:
                writer.writerow([res])
Source: https://stackoverflow.com/a/50328564/6653461
Basically, what I need to change is how to keep the input and output consistent, line by line. See below:
The idea behind all this is to get/print the redirected link: if the link works, print it; if not, print an error entry instead.
urls.csv sample
https://www.tennis-point.com/index.php?stoken=737F2976&lang=1&cl=search&searchparam=E705Y-0193; - valid
https://www.tennis-point.com/index.php?stoken=737F2976&lang=1&cl=search&searchparam=E703Y-0193; - non valid
https://www.tennis-point.com/index.php?stoken=737F2976&lang=1&cl=search&searchparam=E702Y-4589; - valid
https://www.tennis-point.com/index.php?stoken=737F2976&lang=1&cl=search&searchparam=E706Y-9093; - non valid
You just need to add some more items to the list you are writing with the csv.writerow() function:
from bs4 import BeautifulSoup
import requests
import csv

with open('urls.csv', 'r') as csvFile, open('results.csv', 'w', newline='') as results:
    reader = csv.reader(csvFile)
    writer = csv.writer(results)

    for row in reader:
        # get the url
        for url in row:
            url = url.strip()

            # Skip any empty URLs
            if len(url):
                print(url)

                # fetch content from server
                try:
                    html = requests.get(url).content
                except requests.exceptions.ConnectionError as e:
                    writer.writerow([url, '', 'bad url'])
                    continue
                except requests.exceptions.MissingSchema as e:
                    writer.writerow([url, '', 'missing http...'])
                    continue

                # soup fetched content
                soup = BeautifulSoup(html, 'html.parser')

                divTag = soup.find("div", {"class": "productsPicture"})

                if divTag:
                    # Return all 'a' tags that contain an href
                    for a in divTag.find_all("a", href=True):
                        url_sub = a['href']

                        # Test that link is valid
                        try:
                            r = requests.get(url_sub)
                            writer.writerow([url, url_sub, 'ok'])
                        except requests.exceptions.ConnectionError as e:
                            writer.writerow([url, url_sub, 'bad link'])
                else:
                    writer.writerow([url, '', 'no results'])
Giving you:
https://www.tennis-point.com/index.php?stoken=737F2976&lang=1&cl=search&searchparam=E705Y-0193,https://www.tennis-point.com/asics-gel-game-6-all-court-shoe-men-white-silver-02013802643000.html,ok
https://www.tennis-point.com/index.php?stoken=737F2976&lang=1&cl=search&searchparam=E703Y-0193,https://www.tennis-point.com/asics-gel-game-6-all-court-shoe-men-white-silver-02013802643000.html,no results
https://www.tennis-point.com/index.php?stoken=737F2976&lang=1&cl=search&searchparam=E702Y-4589,https://www.tennis-point.com/asics-gel-resolution-7-clay-court-shoe-men-blue-lime-02014202831000.html,ok
https://www.tennis-point.com/index.php?stoken=737F2976&lang=1&cl=search&searchparam=E706Y-9093,https://www.tennis-point.com/asics-gel-resolution-7-clay-court-shoe-men-blue-lime-02014202831000.html,no results
Exception handling can catch the case where the URL from the CSV file is invalid. You can also test that the URL returned from the link on the page is valid. The third column could then give you a status, i.e. ok, bad url, no results or bad link.
It assumes that all columns in your CSV file contain URLs that need to be tested.
import requests
from bs4 import BeautifulSoup
import csv
from urlparse import urljoin
import urllib2

base_url = 'http://www.baseball-reference.com/'  # base url for concatenation
data = requests.get("http://www.baseball-reference.com/teams/BAL/2014-schedule-scores.shtml")  # website for scraping
soup = BeautifulSoup(data.content)
b = 5

for link in soup.find_all('a'):
    if not link.has_attr('href'):
        continue
    if link.get_text() != 'boxscore':
        continue

    url = base_url + link['href']
    response = requests.get(url)
    html = response.content
    soup = BeautifulSoup(html)

    # Scores
    table = soup.find('table', attrs={'id': 'BaltimoreOriolespitching'})
    for row in table.findAll('tr'):
        list_of_cells = []
        for cell in row.findAll('td'):
            text = cell.text.replace(' ', '')
            list_of_cells.append(text)
        for list in list_of_cells:
            with open('test1.csv', 'w', newline='') as fp:
                a = csv.writer(fp, delimiter=',')
                a.writerows(list)
I am trying to write the scraped info to a CSV so that each piece of information has its own cell. The more I play with the code, I either get an indentation error or only the first row prints to the CSV and that's it.
IndentationError: expected an indented block
I think the first thing to consider is moving the file opening and the creation of the CSV writer outside the loop. You're overwriting the CSV file ('w') on each pass through the for loop. So try this:
with open('test1.csv', 'w', newline='') as fp:
    csvw = csv.writer(fp, delimiter=',')

    for link in soup.find_all('a'):
        if not link.has_attr('href'):
            continue
        if link.get_text() != 'boxscore':
            continue

        url = base_url + link['href']
        response = requests.get(url)
        html = response.content
        soup = BeautifulSoup(html)

        # Scores
        table = soup.find('table', attrs={'id': 'BaltimoreOriolespitching'})
        for row in table.findAll('tr'):
            list_of_cells = []
            for cell in row.findAll('td'):
                text = cell.text.replace(' ', '')
                list_of_cells.append(text)
            for list in list_of_cells:
                csvw.writerows(list)
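One caveat with the last inner loop: csv.writer.writerows() iterates over its argument, so passing a single string writes one character per row. If the intent is one CSV row per table row, a sketch of the replacement (using the same csvw and list_of_cells as above) would be:

# write each table row as a single CSV row instead of looping over the cells
csvw.writerow(list_of_cells)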