Python: save same-title files in the folder

Code:
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
import json
from os import listdir

res = requests.get('http://www.abcde.com/frontend/SearchParts')
soup = BeautifulSoup(res.text, "lxml")
href = [a["href"] for a in soup.findAll("a", {"id": re.compile("parts_img.*")})]

b1 = []
for url in href:
    b1.append("http://www.abcde.com" + url)
# print(b1)

b = []
for i in range(len(b1)):
    res2 = requests.get(b1[i]).text
    soup2 = BeautifulSoup(res2, "lxml")
    url_n = soup2.find('', rel='next')['href']
    url_n = "http://www.abcde.com" + url_n
    # print(url_n)
    b.append(b1[i])
    b.append(url_n)
    while True:
        res3 = requests.get(url_n).text
        soup3 = BeautifulSoup(res3, "lxml")
        try:
            url_n = soup3.find('', rel='next')['href']
        except TypeError:
            break
        if url_n:
            url_n = "http://www.abcde.com" + url_n
            # print(url_n)
            b.append(url_n)

all = []
for url in b:
    res = requests.get(url)
    soup = BeautifulSoup(res.text, "lxml")
    for item in soup.select(".article-title"):
        all.append(urljoin('http://www.abcde.com', item['href']))

for urls in all:
    re = requests.get(urls)
    soup = BeautifulSoup(re.text.encode('utf-8'), "html.parser")
    title_tag = soup.select_one('.page_article_title')
    list = []
    for tag in soup.select('.page_article_content'):
        list.append(tag.text)
    list = [c.replace('\n', '') for c in list]
    list = [c.replace('\r', '') for c in list]
    list = [c.replace('\t', '') for c in list]
    list = [c.replace(u'\xa0', u' ') for c in list]
    list = ', '.join(list)
    fruit_tag = soup.select_one('.authorlink')
    fruit_final = None
    if fruit_tag:
        fruit_final = fruit_tag.text
    else:
        fruit_final = fruit_tag
    keys = soup.findAll('div', style="font-size:1.2em;")
    keys_final = None
    list2 = []
    if keys:
        for key in keys:
            list2.append(key.text)
        list2 = [c.replace('\n', '') for c in list2]
        list2 = [c.replace(' ', '') for c in list2]
        list2 = ', '.join(list2)
        key_final = list2
    else:
        key_final = keys
    if key_final == []:
        key_final = None
    ##################edit part####################################
    data = {
        "Title": title_tag.text,
        "Registration": fruit_final,
        "Keywords": key_final,
        "Article": list
    }
    save_path = "C:/json/"
    files = listdir(save_path)
    file_name = save_path + '%s.json' % title_tag.text
    with open(file_name, 'w', encoding='UTF-8') as f:
        if file_name not in files:
            file_d = json.dumps(data, ensure_ascii=False)
            f.write(file_d)
        else:
            file_name = save_path + '%s_1.json' % title_tag.text
            file_d = json.dumps(data, ensure_ascii=False)
            f.write(file_d)
I scraped a web page and extracted every article's title as title_tag.text. Some articles have the same title but different URLs/contents, so I still need to save all of them in my directory. I know how to handle two identical titles: I can name one file with the original title and the other with original_1. But what if four files share the same title? How do I handle that case? Thanks in advance!
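One approach (a minimal sketch, not tested against your site; the helper name unique_path is made up for illustration) is to keep appending an increasing counter to the file name until an unused name is found, instead of hard-coding a single _1 suffix. It only assumes the save_path, title_tag and data variables from the script above:

import json
import os

def unique_path(save_path, title):
    # Try title.json first, then title_1.json, title_2.json, ... until
    # a name is found that does not already exist in the folder.
    candidate = os.path.join(save_path, '%s.json' % title)
    counter = 1
    while os.path.exists(candidate):
        candidate = os.path.join(save_path, '%s_%d.json' % (title, counter))
        counter += 1
    return candidate

# Replacing the "edit part" block inside the loop:
file_name = unique_path("C:/json/", title_tag.text)
with open(file_name, 'w', encoding='UTF-8') as f:
    f.write(json.dumps(data, ensure_ascii=False))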

Related

extract names in custom <h2> but they are extracted many times (beautifulsoup)

I am trying to extract names in custom <h2> tags, but the names I want are extracted many times.
How can I fix this problem and extract each name only once?
The page I am pulling data from is here.
import requests
import csv
from bs4 import BeautifulSoup
from itertools import zip_longest

lawy_name = []
page_num = 1
phone = []
logo = []
website = []
links = []

while True:
    try:
        result = requests.get(f"https://example.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
        src = result.content
        soup = BeautifulSoup(src, "lxml")
        page_limit = int("126")
        if page_num > page_limit // 25:
            print("page ended, terminate")
            break
        lawy_names = soup.select('div.poap.serp-container.lawyer h2.indigo_text')
        for i in range(len(lawy_names)):
            lawy_name.append(lawy_names[i].text.strip())
            links.append(lawy_names[i].find("a").attrs["href"])
        for link in links:
            result = requests.get(link)
            src = result.content
            soup = BeautifulSoup(src, "lxml")
            phones = soup.find("a", {"class": "profile-phone-header profile-contact-btn"})
            phone.append(phones["href"])
            logos = soup.find("div", {"class": "photo-container"})
            logo.append(logos.find('img')['src'])
            websites = soup.find("a", {"class": "profile-website-header", "id": "firm_website"})
            website.append(websites.text.strip())
        page_num += 1
        print("page switched")
    except:
        print("error")
        break

file_list = [lawy_name, phone, website, logo]
exported = zip_longest(*file_list)
with open("/Users/dsoky/Desktop/fonts/Moaaz.csv", "w") as myfile:
    wr = csv.writer(myfile)
    wr.writerow(["lawyer name", "phone", "website", "logo"])
    wr.writerows(exported)
Problem:
The website does produce a lot of duplicate entries. You can probably assume that all entries have unique names, so a dictionary can be used to hold all of your data: simply skip any entry whose name you have already seen. For example:
from bs4 import BeautifulSoup
import requests
import csv

lawyers = {}
page_num = 1

while True:
    print(f"Page {page_num}")
    req = requests.get(f"https://example.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
    soup = BeautifulSoup(req.content, "lxml")
    found = False
    for id in ['sponsored_serps', 'ts_results', 'poap_results', 'basic_results']:
        div_results = soup.find('div', id=id)
        if div_results:
            for result in div_results.find_all('div', class_='lawyer'):
                name = result.h2.get_text(strip=True)
                if name not in lawyers:
                    print(' ', name)
                    link = result.h2.a['href']
                    req_details = requests.get(link)
                    soup_details = BeautifulSoup(req_details.content, "lxml")
                    a_phone = soup_details.find("a", {"class": "profile-phone-header profile-contact-btn"}, href=True)
                    if a_phone:
                        phone = a_phone['href']
                    else:
                        phone = None
                    div_logo = soup_details.find("div", {"class": "photo-container"})
                    if div_logo.img:
                        logo = div_logo.img['src']
                    else:
                        logo = None
                    a_website = soup_details.find("a", {"class": "profile-website-header", "id": "firm_website"})
                    if a_website:
                        website = a_website.get_text(strip=True)
                    else:
                        website = None
                    lawyers[name] = [phone, logo, website]
                    found = True
    # Keep going until no new names are found on a page
    if found:
        page_num += 1
    else:
        break

with open('Moaaz.csv', 'w', newline='') as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerow(['Name', 'Phone', 'Logo', 'Website'])
    for name, details in lawyers.items():
        csv_output.writerow([name, *details])

NoneType object while using Beautiful Soup to extract text from a URL

I was trying to extract the text from the 3 URLs given in the code below (2 of them are commented out) and store it in a text file. My code works for most URLs, but it gives an error for a few of them; I have given 3 examples. I used Beautiful Soup.
The code file is:
import requests
import io
from bs4 import BeautifulSoup

#url = 'https://foundersfund.com/our_team'
#url = 'https://a16z.com/about/team'
url = 'https://ctinnovations.com/learn-about-connecti...'

res = requests.get(url)
html_page = res.content
soup = BeautifulSoup(html_page, 'html.parser')
soup.body.a.text
soup.body.p.b
text = soup.find_all(text=True)
tag = soup.find('div', id='I1_sys_txt')
print(tag.get_text() if tag else "<none found>")

output = ''
blacklist = [
    '[document]',
    'noscript',
    'header',
    'html',
    'meta',
    'head',
    'input',
    'script',
    # there may be more elements you don't want, such as "style", etc.
]

for t in text:
    if t.parent.name not in blacklist:
        output += '{} '.format(t)

outFile = '<filepath>//<filename.txt>'
with io.open(outFile, "w", encoding="utf-8") as textFile:
    textFile.write(output)

print(type(output))
print(output)
#print('output'.join(soup.stripped_strings))
This gives me the following error:
AttributeError                            Traceback (most recent call last)
<ipython-input-22-47bbd55e7171> in <module>
     10 html_page = res.content
     11 soup = BeautifulSoup(html_page, 'html.parser')
---> 12 soup.body.a.text
     13 soup.body.p.b
     14 text = soup.find_all(text=True)

AttributeError: 'NoneType' object has no attribute 'text'
I would appreciate it if anyone can walk me through my mistake and help me get code that works for all URLs.
Thanks
Salil
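For context: the traceback points at soup.body.a. On a page whose parsed <body> contains no <a> tag (or that has no <body> at all after parsing), soup.body.a evaluates to None, and accessing .text on None raises the AttributeError. A minimal guard, shown here as a sketch that only assumes the url variable from the script above, could look like this:

import requests
from bs4 import BeautifulSoup

res = requests.get(url)  # url as defined in the script above
soup = BeautifulSoup(res.content, 'html.parser')

# Only dereference .text when the tag actually exists.
body = soup.body
first_link = body.a if body is not None else None
print(first_link.text if first_link is not None else "no <a> tag found in <body>")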
Thanks everyone, I could solve the issue with the following code:
# EXTRACTING THE URL TEXTS
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import io
import os

L = range(len(url_list))
for i in L:
    url = url_list[i][0]
    storetxt = vcfrm_list[i][0]
    print(url)
    print(storetxt)
    print("---------------------------------------------------------------------------------------")
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = Request(url, headers=hdr)
    page = urlopen(req)
    soup = BeautifulSoup(page)
    text = soup.find_all(text=True)
    output = ''
    blacklist = [
        '[document]',
        'noscript',
        'header',
        'html',
        'meta',
        'head',
        'input',
        'script',
        # there may be more elements you don't want, such as "style", etc.
    ]
    for t in text:
        if t.parent.name not in blacklist:
            output += '{} '.format(t)
    path = 'C://Users//ubana//OneDrive//ANIL JOSHI//PROJECTS//NAMED ENTITY RECOGNITION//XLXURLTXT'
    name_of_file = storetxt  # input("Enter the FileName : ")
    outFile = os.path.join(path, name_of_file + ".txt")
    with io.open(outFile, "w", encoding="utf-8") as textFile:
        textFile.write(output)

print("DONE")
#print('output'.join(soup.stripped_strings))

Trouble with my output when web scraping a website

I would like to scrape all the company names from all the links right here:
https://www.bilansgratuits.fr/secteurs/finance-assurance,k.html
Each of those links contains several companies, like here:
https://www.bilansgratuits.fr/classement/6420Z/default.html
My goal is to get all those companies for all the links.
Here's my script so far:
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer

tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

def clean_text(text):
    text = tokenizer.tokenize(text)
    final_text = ' '.join([w for w in text])
    return final_text

url = 'https://www.bilansgratuits.fr/secteurs/finance-assurance,k.html'
links = []
results = requests.get(url)
soup = BeautifulSoup(results.text, "html.parser")
links = [a['href'] for a in soup.find("div", {"class": "listeEntreprises"}).find_all('a', href=True)]

names = []
root_url = 'https://www.bilansgratuits.fr/'
urls = ['{root}{i}'.format(root=root_url, i=i) for i in links]

for url in urls[:3]:
    results = requests.get(url)
    soup = BeautifulSoup(results.text, "html.parser")
    try:
        name = [a.text for a in soup.find("div", {"class": "donnees"}).find_all('a', href=True)]
    except:
        name = [a.text for a in soup.find("div", {"class": "listeEntreprises"}).find_all('a', href=True)]
    names.append(name)

for i in range(0, 3):
    rx = re.compile(r'^\s+$')
    names[i] = [item.split() for item in names[i] if not rx.match(item)]

data = pd.DataFrame({
    'names': names
})
data['names'] = data['names'].apply(str)
data['names'] = data['names'].apply(lambda x: clean_text(x))
print(data)
#data.to_csv('dftest.csv', sep=';', index=False, encoding = 'utf_8_sig')
I get this output (screenshot omitted), but that's not what I want: I would like each row to contain the name of one company (desired-output screenshot omitted), and so on for all the names.
Is this what you want?
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = "https://www.bilansgratuits.fr/secteurs/finance-assurance,k.html"
html = requests.get(url).text

follow_urls = [
    f"https://www.bilansgratuits.fr{anchor['href']}" for anchor
    in BeautifulSoup(html, "html.parser").select(".titreElementAnnuaire a")
]

data = []
for follow_url in follow_urls:
    print(f"Fetching: {follow_url}")
    css_selector = ".titreElementAnnuaire a" if "6411Z" in follow_url else ".classementTop .blocRaisonSociale > a"
    company_urls = BeautifulSoup(
        requests.get(follow_url).text,
        "html.parser",
    ).select(css_selector)
    data.extend(
        [
            [
                " ".join(anchor.getText(strip=True).split()),
                f"https://www.bilansgratuits.fr{anchor['href']}",
            ] for anchor in company_urls
        ]
    )

pd.DataFrame(data).to_csv("your_data.csv", index=False, header=["Company", "URL"])
print("Done!")
Output: 345 entries in a .csv file (screenshot omitted).
Here's my final answer!
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import re
import itertools

url = 'https://www.bilansgratuits.fr/secteurs/finance-assurance,k.html'
links = []
results = requests.get(url)
#time.sleep(20)
soup = BeautifulSoup(results.text, "html.parser")
links = [a['href'] for a in soup.find("div", {"class": "listeEntreprises"}).find_all('a', href=True)]
secteur = [a.text for a in soup.find("div", {"class": "listeEntreprises"}).find_all('a', href=True)]

secteurs = []
URLS = []
names = []
root_url = 'https://www.bilansgratuits.fr/'
urls = ['{root}{i}'.format(root=root_url, i=i) for i in links]

for url, secteur in zip(urls[:3], secteur[:3]):
    results = requests.get(url)
    soup = BeautifulSoup(results.text, "html.parser")
    try:
        name = [a.text for a in soup.find("div", {"class": "donnees"}).find_all('a', href=True)]
        for i in name:
            URLS.append(url)
        for i in name:
            secteurs.append(secteur)
    except:
        name = [a.text for a in soup.find("div", {"class": "listeEntreprises"}).find_all('a', href=True)]
        for i in name:
            URLS.append(url)
        for i in name:
            secteurs.append(secteur)
    names.append(name)

for i in range(0, 3):
    rx = re.compile(r'^\s+$')
    names[i] = [item.split() for item in names[i] if not rx.match(item)]

res = []
for sublist in names:
    for lis in sublist:
        res.append(' '.join([w for w in lis]))

data = pd.DataFrame({
    'names': res,
    'URL': URLS,
    'Secteur': secteurs
})

data.to_csv('dftest.csv', sep=';', index=False, encoding='utf_8_sig')

Python BeautifulSoup webscraping

I hope somebody can help me with the following issue.
I would like to get the data in one row; this is what I get now in my CSV:
9200000083649863,bol.com retourdeals
9200000083649863,"41,75"
9200000083649863,ITidee
9200000083649863,"45,88"
9200000083649863,Bol.com
9200000083649863,"47,99"
What I would like:
9200000083649863,bol.com retourdeals,41,75
9200000083649863,ITidee,45,88
9200000083649863,Bol.com,47,99
This is the code:
import csv
import requests
from bs4 import BeautifulSoup

def haalprijs_verkoper(ean, Urll):
    URL = Urll
    ean = ean
    page = requests.get(URL)
    csvfile = open('/home/filoor1/webscrape/book1.csv', 'a')
    csvwriter = csv.writer(csvfile)
    soup = ""
    results = ""
    soup = BeautifulSoup(page.text, 'html.parser')
    results = soup.find(id='offers')
    naam = results.find_all("p, strong")
    prijs = results.find_all("span")
    # print(results.prettify())
    counter = 0
    for tag in results.find_all(['strong', 'span']):
        # print(tag.text)
        aa = tag.text
        aa = aa.replace("Nieuw", "")
        aa = aa.replace(" ", "")
        aa = aa.replace("\n", "")
        aa = aa.replace("''", "aaaaaa")
        aa = aa.strip(' "')
        aa = aa.strip('"')
        if aa != "":
            counter += 0.5
            # print(ean, aa, counter)
            csvwriter.writerow([ean, aa])

haalprijs_verkoper(9200000083649863, 'https://www.bol.com/nl/prijsoverzicht/tp-link-tl-sg1005p-switch/9200000083649863/?filter=all&sort=price&sortOrder=asc')
Thank you
You can use this example to scrape the data and save the correct CSV:
import csv
import requests
from bs4 import BeautifulSoup

url = 'https://www.bol.com/nl/prijsoverzicht/tp-link-tl-sg1005p-switch/9200000083649863/?filter=all&sort=price&sortOrder=asc'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

ean = '9200000083649863'
all_data = []
for s, p in zip(soup.select('p.nosp > strong'),
                soup.select('span.product-prices__currency.product-prices__bol-price')):
    all_data.append([ean, s.get_text(strip=True), p.get_text(strip=True)])

with open('data.csv', 'w') as f_out:
    writer = csv.writer(f_out)
    writer.writerows(all_data)
Saves this data.csv:
9200000083649863,bol.com retourdeals,"41,75"
9200000083649863,ITidee,"45,88"
9200000083649863,Bol.com,"47,99"
9200000083649863,4Allshop,"49,70"
9200000083649863,codima,"51,69"
9200000083649863,PlazaSale.nl,"53,40"
9200000083649863,Stock Sellers B.V.,"53,67"
9200000083649863,Art & Craft,"54,27"
9200000083649863,ORM Wholesale,"54,38"
9200000083649863,DutchDo B.V.,"55,92"

Skip Over Item if element doesn't exist on page

I have a script that loops through multiple pages of a website, and I want to skip over (or add a blank entry for) any item that might not be present on certain pages. For example, some pages do not contain a description of the book. When I run into one of those pages I get an AttributeError. My script below loops through the first two pages with no problem, but it stops when it hits the third page.
Here is the traceback
    item['description'] = about.h2.nextSibling.nextSibling.nextSibling.text
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/beautifulsoup4-4.6.0-py3.6.egg/bs4/element.py", line 737, in __getattr__
AttributeError: 'NavigableString' object has no attribute 'text'
How can I fix this? Here is my script:
from bs4 import BeautifulSoup as soup
import requests
import json

base_url = "https://open.umn.edu/opentextbooks/"
data = []
n = 30

for i in range(4, n+1):
    response = requests.get(base_url + "BookDetail.aspx?bookId=" + str(i))
    # html parsing
    page_soup = soup(response.content, "html5lib")
    # grabs info for each textbook
    containers = page_soup.findAll("div", {"class": "LongDescription"})
    author = page_soup.select("p")
    about = page_soup.find("div", {"id": "AboutBook"})
    for container in containers:
        item = {}
        item['type'] = "Textbook"
        item['title'] = container.find("div", {"class": "twothird"}).h1.text
        item['author'] = author[3].get_text(separator=', ')
        if item['author'] == " ":
            item['author'] = "University of Minnesota Libraries Publishing"
        item['link'] = "https://open.umn.edu/opentextbooks/BookDetail.aspx?bookId=" + str(i)
        if not container.find(string="Publisher: "):
            item['publisher_url'] = item['publisher'] = ""
        else:
            item['publisher'] = container.find(text="Publisher: ").nextSibling.text
            item['publisher_url'] = container.find(text="Publisher: ").nextSibling['href']
        item['source'] = "Open Textbook Library"
        if not about.h2.nextSibling.nextSibling.nextSibling:
            item['description'] = ""
        else:
            item['description'] = about.h2.nextSibling.nextSibling.nextSibling.text
        item['base_url'] = "https://open.umn.edu/opentextbooks/"
        if container.find("p", {"class": "Badge-Condition"}).a:
            item['license'] = container.find("p", {"class": "Badge-Condition"}).a.text
        if container.find("img", {"class": "ctl00_maincontent_imgLicence"}):
            item['license'] = ''
        if container.find("p", {"class": "Badge-Condition"}).a:
            item['license_url'] = container.find("p", {"class": "Badge-Condition"}).a["href"]
        if container.find("img", {"class": "ctl00_maincontent_imgLicence"}):
            item['license_url'] = ''
        if container.find("div", {"class": "twothird"}).p:
            item['review'] = container.find("div", {"class": "twothird"}).p.text
        else:
            item['review'] = ''
        if item['review'].startswith('('):
            item['review'] = item['review'].replace('(', '')
        if item['review'].endswith(' reviews)'):
            item['review'] = item['review'].replace(' reviews)', '')
        if item['review'] > str(0):
            item['review'] = "Reviewed Resource"
        else:
            item['review'] = ''
        item['image_url'] = "https://open.umn.edu/opentextbooks/" + container.img["src"]
        data.append(item)  # add the item to the list

with open("./json/otl-1.json", "w") as writeJSON:
    json.dump(data, writeJSON, ensure_ascii=False)
I wouldn't recommend parsing the description with item['description'] = about.h2.nextSibling.nextSibling.nextSibling.text; that's too specific. I came up with this code:
from bs4 import BeautifulSoup as soup
import requests
import json
from pprint import pprint

base_url = "https://open.umn.edu/opentextbooks/"
data = []
n = 30

for i in range(4, n+1):
    response = requests.get(base_url + "BookDetail.aspx?bookId=" + str(i))
    page_soup = soup(response.content, "lxml")
    data = {}
    title, author, description = page_soup.select('h1')[0].text, \
        page_soup.select('h1 ~ p')[3].get_text(', '), \
        '\n'.join(p.text.strip() for p in page_soup.select('div#AboutBook > p') if p.text.strip())
    data['type'] = "Textbook"
    data['title'] = title
    data['author'] = author if author.strip() else "University of Minnesota Libraries Publishing"
    data['link'] = "https://open.umn.edu/opentextbooks/BookDetail.aspx?bookId=" + str(i)
    data['source'] = "Open Textbook Library"
    data['description'] = description
    pprint(data)

# with open("./json/otl-1.json", "w") as writeJSON:
#     json.dump(data, writeJSON, ensure_ascii=False)
Prints:
{'author': 'University of Minnesota Libraries Publishing',
 'description': 'This book is intended for an undergraduate or MBA level '
                'Financial Accounting course. It covers the standard topics in '
                'a standard sequence, utilizing the Socratic method of asking '
                'and answering questions.',
 'link': 'https://open.umn.edu/opentextbooks/BookDetail.aspx?bookId=4',
 'source': 'Open Textbook Library',
 'title': 'Financial Accounting',
 'type': 'Textbook'}
...and so on (for each book)
Wherever you are getting the AttributeError, you can wrap that code in a try/except block:
try:
    ...  # your code here
except AttributeError:
    pass  # or handle it in some other way
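Applied to the description lookup from the script above, a minimal sketch (reusing the item and about names from that script) could look like this:

# Fall back to an empty description when the expected sibling chain is missing.
try:
    item['description'] = about.h2.nextSibling.nextSibling.nextSibling.text
except AttributeError:
    item['description'] = ""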
