I want the data like this...
"Basic jersey
Does what it says on the tin
Main: 100% Cotton."
in a single cell, but I'm getting the data like this...
"Basic jerseyDoes what it says on the tinMain: 100% Cotton."
This is the HTML:
<div class="about-me">
<h4>ABOUT ME</h4>
<span><div>Basic jersey</div><div>Does what it says on the tin</div><br>Main: 100% Cotton.</span>
</div>
This is my code:
from selenium import webdriver
from lxml import html
import pandas as pd
import collections, os
from bs4 import BeautifulSoup

def Save_to_Csv(data):
    filename = 'data.csv'
    df = pd.DataFrame(data)
    df.set_index('Title', drop=True, inplace=True)
    if os.path.isfile(filename):
        # append without repeating the header
        with open(filename, 'a') as f:
            df.to_csv(f, mode='a', sep=",", header=False, encoding='utf-8')
    else:
        df.to_csv(filename, sep=",", encoding='utf-8')

with open('urls.txt', 'r') as f:
    links = [link.strip() for link in f.readlines()]

driver = webdriver.Chrome()

for urls in links:
    driver.get(urls)
    source = driver.page_source
    tree = html.fromstring(source)
    data = BeautifulSoup(source, 'html.parser')

    imgtag = data.find_all('li', attrs={'class': 'image-thumbnail'})
    image = []
    for imgsrc in imgtag:
        image.append(imgsrc.img['src'].replace('?$S$&wid=40&fit=constrain', '?$XXL$&wid=513&fit=constrain'))

    title = tree.xpath('string(.//div/h1)')
    price = tree.xpath('string(.//span[@class="current-price"])')
    sku = tree.xpath('string(.//div[@class="product-code"]/span)')
    aboutme = tree.xpath('string(.//div[@class="about-me"]/span)')

    foundings = collections.OrderedDict()
    foundings['Title'] = [title]
    foundings['Price'] = [price]
    foundings['Product_Code'] = [sku]
    foundings['About_Me'] = [aboutme]
    foundings['Image'] = [image]

    Save_to_Csv(foundings)
    print(title, price, sku, aboutme, image)

driver.close()
Using the HTML you have given, you can solve this using the stripped_strings generator as follows:
from bs4 import BeautifulSoup
html = """
<div class="about-me">
<h4>ABOUT ME</h4>
<span><div>Basic jersey</div><div>Does what it says on the tin</div><br>Main: 100% Cotton.</span>
</div>"""
soup = BeautifulSoup(html, "html.parser")
print('\n'.join(soup.span.stripped_strings))
This gets each text component stripped of whitespace and then joins them together with newlines:
Basic jersey
Does what it says on the tin
Main: 100% Cotton.
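If you want to plug this into the Selenium loop above, a minimal sketch (my own, assuming the product pages keep the same about-me structure) replaces the string() XPath for aboutme with the BeautifulSoup object you already build:

# `data` is the BeautifulSoup object created in the loop above
about = data.find('div', class_='about-me')
# join the fragments with newlines; fall back to '' if the section is missing
aboutme = '\n'.join(about.span.stripped_strings) if about and about.span else ''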
I want to extract Name & Position, Education, Contact number, and Email, each into a different column of a CSV, but when I extract them I get either a single cell per character or a single column per paragraph (if I wrap the text in a list). Here is the code:
import requests
from bs4 import BeautifulSoup
from csv import writer

url = 'https://governors.pwcs.edu/about_us/staff_bios_and_contact_information'
req = requests.get(url)
soup = BeautifulSoup(req.text, 'lxml')
page = soup.find_all('p')
for i in page:
    i = i.text
    with open('page.csv', 'a', encoding='utf8', newline='') as f:
        thewriter = writer(f)
        thewriter.writerow(i)
You can use regex to pull out what you need:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

url = 'https://governors.pwcs.edu/about_us/staff_bios_and_contact_information'
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')
content = soup.find('div', {'id': 'divContent'})
p_list = content.find_all('p')
rows = []
for p in p_list:
    string = p.text
    # groups: name, "Education: " label, education, " Contact" label, phone, email
    text = re.search(r'(^.*) (Education: )(.*)( Contact).*(\d{3}-\d{3}-\d{4})\s*([a-zA-Z0-9].*@\w.*\.\w.*)', string).groups()
    name = text[0]
    edu = text[2]
    phone = text[4]
    email = text[5]
    row = {
        'name': name,
        'education': edu,
        'phone': phone,
        'email': email}
    rows.append(row)

df = pd.DataFrame(rows)
df.to_csv('page.csv', index=False)
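One caveat (my addition, not part of the original answer): re.search returns None for any paragraph that doesn't match, and calling .groups() on None raises AttributeError. A guarded variant of the loop body:

match = re.search(r'(^.*) (Education: )(.*)( Contact).*(\d{3}-\d{3}-\d{4})\s*([a-zA-Z0-9].*@\w.*\.\w.*)', string)
if match is None:
    continue  # skip paragraphs that don't follow the bio layout
text = match.groups()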
I'm trying to scrape the title, contact information (phone), and webpage from this URL:
https://partnerportal.fortinet.com/directory/search?l=Spain&p=1
The &p=1 parameter is the page number; there are 92 pages in total.
This is my code. I cannot get anything in the print output.
import datetime
import requests
from bs4 import BeautifulSoup
import csv

filename = "fichero" + datetime.datetime.now().strftime("%d-%m-%Y") + ".csv"

with open(filename, "w+") as f:
    writer = csv.writer(f)
    writer.writerow(["Nombre Empresa", "Direccion Empresa", "Telefono Empresa"])

for i in range(1, 3):
    r = requests.get('https://partnerportal.fortinet.com/directory/search?l=Spain&p=' + format(i))
    soup = BeautifulSoup(r.text, "html.parser")
    array_title = soup.select('div.panel panel-default div.col-sm-10 h3')
    array_address = soup.select('div.panel panel-default p.locator-partner-info')
    array_webpage = soup.find_all('a', class_='locator-parter-site', text=True)
    for iterator in range(0, len(array_title)):
        title = array_title[iterator].text.strip()
    for iterator2 in range(0, len(array_address)):
        address = array_address[iterator2].text.strip()
        print(title)
        print(address)
Instead of this:
array_title = soup.select('div.panel panel-default div.col-sm-10 h3')
array_address = soup.select('div.panel panel-default p.locator-partner-info')
try this:
array_title = soup.select('div.panel-default div.col-sm-10 h3')
array_address = soup.select('div.panel-default p.locator-partner-info')
Also, you are printing title inside the address loop, which prints only the last assigned value (i.e., Tiws). Print it in the title loop to see the correct result, or pair the two lists as sketched below.
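A small sketch of that idea (my own, assuming titles and addresses appear in matching order on the page), pairing the two lists instead of using separate index loops:

for title, address in zip(array_title, array_address):
    print(title.text.strip())
    print(address.text.strip())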
To get the data from the website:
url = "https://partnerportal.fortinet.com/directory/search?l=Spain&p=1"
html = bs4.BeautifulSoup(requests.get(url).text, 'lxml')
rows = html.find('div', {'class':'row row-results'})
results = [{
'id': row.find('div', {'class':'col-sm-10'})\
.find('h3').getText(),
'phone':row.find('div', {'class':'partner-info-box'})\
.find('p').getText().split('Phone: ')[1].split('\n')[0],
'url': row.find('div', {'class':'partner-info-box'})\
.find('a').get('href')
} for row in html.find('div', {'class':'row row-results'})\
.find_all('div', {'class':'col-sm-12'})]
To save the resulting list of dicts to a .csv:
import pandas as pd
import datetime

filename = "fichero" + datetime.datetime.now().strftime("%d-%m-%Y") + ".csv"
pd.DataFrame(results).to_csv(filename, index=False)
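The question mentions 92 result pages; a sketch (mine, assuming every page shares the same markup) that wraps the extraction in a loop over the p parameter:

import bs4
import requests

all_results = []
for page in range(1, 93):  # the question says there are 92 pages
    url = f"https://partnerportal.fortinet.com/directory/search?l=Spain&p={page}"
    html = bs4.BeautifulSoup(requests.get(url).text, 'lxml')
    rows = html.find('div', {'class': 'row row-results'})
    if rows is None:
        break  # stop if a page comes back without a results block
    for row in rows.find_all('div', {'class': 'col-sm-12'}):
        info = row.find('div', {'class': 'partner-info-box'})
        all_results.append({
            'id': row.find('div', {'class': 'col-sm-10'}).find('h3').getText(),
            'phone': info.find('p').getText().split('Phone: ')[1].split('\n')[0],
            'url': info.find('a').get('href'),
        })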
I have written a script that scrapes tables from a website and saves them to an Excel sheet:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandas import ExcelWriter
import os.path

path = "C:...."
url = 'https://zoek.officielebekendmakingen.nl/kst-35570-2.html'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

tables_df = pd.read_html(url, attrs={'class': 'kio2 portrait'})
tables = soup.find_all('table', class_="kio2 portrait")

titles = []
for table in tables:
    print(table)
    title = table.find_all("caption", class_="table-title")
    titles.append(title)

titles = []
writer = pd.ExcelWriter('output.xlsx')
for i, df in enumerate(tables_df, 1):
    df.to_excel(writer, index=True, sheet_name=f'sheetName_{i}')
writer.save()
This works, but now I want to find the titles of these tables so I can give each sheet its title. For example, the first table contains the following text I am interested in:
<table cellpadding="0" cellspacing="0" class="kio2 portrait" summary="Tabel 1.1 Budgettaire kerngegevens"><caption class="table-title">Tabel 1.1 Budgettaire kerngegevens</caption>
Now I want to scrape the part between <caption class="table-title"> and </caption>. Alternatively, I could use the summary attribute. How can I achieve this? I have tried it within the code but have not found anything yet.
Try:
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = "https://zoek.officielebekendmakingen.nl/kst-35570-2.html"
soup = BeautifulSoup(requests.get(url).text, "html.parser")

writer = pd.ExcelWriter("output.xlsx")
for i, table in enumerate(soup.find_all("table", class_="kio2 portrait"), 1):
    df = pd.read_html(str(table))[0]
    caption = table.get("summary", "").replace(":", "").strip()
    # some tables don't have a summary, so fall back to a generic sheet name:
    if not caption:
        caption = f"table {i}"
    df.to_excel(writer, sheet_name=caption)

writer.save()
This creates output.xlsx with 185 sheets (at least when opened in my LibreOffice).
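One caveat, not from the original answer: Excel itself caps sheet names at 31 characters, so a caption like "Tabel 1.1 Budgettaire kerngegevens" (34 characters) triggers a warning from the writer and may not open cleanly in Excel, even if LibreOffice tolerates it. A minimal guard before the to_excel call:

caption = caption[:31]  # Excel limits sheet names to 31 characters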
I want to get info from a website by web scraping with Python (which I'm learning now), but it prints the classes (that I got the info from) first in the CSV and then prints the information I want. I have watched the YouTube video many times and wrote the same code, but the video doesn't show the problem I'm getting. Is there anyone who can help me?
Here is an image of the CSV showing how it looks when I run the code.
Code:
import requests
from bs4 import BeautifulSoup
import csv
from itertools import zip_longest

Job_titles = []
Company_names = []
Location_names = []
Job_skills = []
Links = []

result = requests.get("https://wuzzuf.net/search/jobs/?q=python&a=hpb")
src = result.content
soup = BeautifulSoup(src, "lxml")

Job_titles = soup.find_all('h2', {"class": "css-m604qf"})
Company_names = soup.find_all('a', {"class": "css-17s97q8"})
Location_names = soup.find_all('span', {"class": "css-5wys0k"})
Job_skills = soup.find_all("div", {'class': "css-y4udm8"})

for i in range(len(Company_names)):
    Job_titles.append(Job_titles[i].text)
    Company_names.append(Company_names[i].text)
    Location_names.append(Location_names[i].text)
    Job_skills.append(Job_skills[i].text)

file_list = [Job_titles, Company_names, Location_names, Job_skills,]
exported = zip_longest(*file_list)

with open("C:/Users/Saleh saleh/Documents/jobtest.csv", "w") as myfile:
    wr = csv.writer(myfile)
    wr.writerow(["Job titles", "Company names", "Location", "Skills", "Links"])
    wr.writerows(exported)
To get the information from the site, you can use the following example:
import csv
import requests
from bs4 import BeautifulSoup

url = "https://wuzzuf.net/search/jobs/?q=python&a=hpb"
soup = BeautifulSoup(requests.get(url).content, "html.parser")

with open("data.csv", "w") as f_in:
    writer = csv.writer(f_in)
    writer.writerow(
        ["Job titles", "Company names", "Location", "Skills", "Links"]
    )
    for title in soup.select("h2 > a"):
        # the company, location and skills nodes follow each job title in document order
        company_name = title.find_next("a")
        location = company_name.find_next("span")
        info = location.find_next("div", {"class": None})
        writer.writerow(
            [
                title.text,
                company_name.text,
                location.text,
                ",".join(
                    a.text.replace("·", "").strip() for a in info.select("a")
                ),
                title["href"],
            ]
        )
This creates data.csv (screenshot from LibreOffice).
I'm web scraping with Beautiful Soup and I am getting an error on line 13: for row in table.findAll('tr').
The error comes up in the cmd. I hope someone can help.
import csv
import requests
from bs4 import BeautifulSoup

url = 'http://www.dublincity.ie/dublintraffic/carparks.htm'
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html)

table = soup.find('tbody', attrs={'id': 'itemsBody'})

list_of_rows = []
for row in table.findAll('tr'):
    list_of_cells = []
    for cell in row.findAll('td'):
        text = cell.text.replace(' ', '')
        list_of_cells.append(text)
    list_of_rows.append(list_of_cells)

outfile = open("./carpark.csv", "wb")
writer = csv.writer(outfile)
writer.writerows(["location", "spaces"])
writer.writerows(list_of_rows)
If you want to stick with BeautifulSoup, you can fetch and write the content using its xml parser along with csv.DictWriter(). Check out the implementation:
import csv
import requests
from bs4 import BeautifulSoup

url = 'http://www.dublincity.ie/dublintraffic/cpdata.xml?1543254514266'
res = requests.get(url)
soup = BeautifulSoup(res.content, "xml")

data = []
for item in soup.select("carpark"):
    ditem = {}
    ditem['Name'] = item.get("name")
    ditem['Spaces'] = item.get("spaces")
    data.append(ditem)

with open("xmldocs.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, ["Name", "Spaces"])
    writer.writeheader()
    for info in data:
        writer.writerow(info)
You could retrieve the data as an XML document and then parse it. This is just an example of part of the process, which you could tailor.
import requests
from xml.etree import ElementTree
import pandas as pd

url = 'http://www.dublincity.ie/dublintraffic/cpdata.xml?1543254514266'
xml_data = requests.get(url).content
tree = ElementTree.fromstring(xml_data)

parking = []
for child in tree:
    for nextChild in child:
        parking.append([child.tag, nextChild.attrib['name'], nextChild.attrib['spaces']])

df = pd.DataFrame(parking)
print(df)
df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8', index=False)
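A small optional tweak, not in the original answer: name the columns when building the DataFrame so the CSV header is meaningful (the column names here are my own guesses at what the fields represent):

df = pd.DataFrame(parking, columns=['area', 'name', 'spaces'])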