Extract Information out of html - python

How can I extract the information from the attached HTML and save it to a text file in the following format:
Paragraph-ID \t TokenID \t TokenCoordinates \t TokenContent
So, for example, the first lines should look like this:
T102633 1 109,18,110,18 IV
T102634 1 527,29,139,16 Seit
...
I'd like to use python. At the moment, I have the following:
# Parse the page and pick the <table class="main"> elements.
root = lxml.html.parse('html-file').getroot()
tables = root.cssselect('table.main')
# XPath addresses attributes with '@'. This second query replaces the result
# above and keeps only main tables that are not nested in another main table.
tables = root.xpath('//table[@class="main" and not(ancestor::table[@class="main"])]')

# Remove every <span class="finereader"> while keeping its text and tail in
# the surrounding content.
for elem in root.xpath("//span[@class='finereader']"):
    text = (elem.text or "") + (elem.tail or "")
    previous = elem.getprevious()
    if previous is not None:  # If there's a previous node
        previous.tail = (previous.tail or "") + text  # append to its tail
    else:
        parent = elem.getparent()  # Otherwise use the parent
        parent.text = (parent.text or "") + text  # and append to its text
    elem.getparent().remove(elem)

# tostring(..., encoding="utf-8") yields encoded byte strings, so they are
# joined with a bytes separator and decoded once, right before writing.
txt = [lxml.etree.tostring(t, method="html", encoding="utf-8") for t in tables]
text = b"\n".join(txt)
output.write(text.decode("utf-8"))
This gives me something like this:
[:T102633-1
coord="109,18,110,18":]IV[:/T102633-1:]
Now, it's clear that I could use the string-find-method to extract the information I want. But is there no more elegant solution? With ".attrib" or something like that?
Thanks for any help!
Here, one can find the html: http://tinyurl.com/qjvsp4n

This code using BeautifulSoup gives all the spans you are interested in:
from bs4 import BeautifulSoup

# The with-block closes the file as soon as the document has been parsed.
with open('html_file') as html_file:
    soup = BeautifulSoup(html_file)
table = soup.find('table', attrs={'class':'main'})
# The first two tr's dont seem to contain the info you need,
# so get rid of them
rows = table.find_all('tr')[2:]
for row in rows:
    # The spans of interest are in the second cell of each row.
    data = row.find_all('td')[1]
    span_element = data.find_all('span')
    for ele in span_element:
        # Parenthesised single-argument print runs on Python 2 and 3 alike.
        print(ele.text)
Once you have the data in the format [:T102639-3 coord="186,15,224,18":]L.[:/T102639-3:], use the python regex module to get the content.
import re

# Matches "[:<header>:]<content>[:/<id>:]". Raw string, so the backslash
# escapes reach the regex engine unchanged.
pattern = re.compile(r'\[:(.*):\](.*)\[:\/(.*):\]')
# Single-quoted so the double quotes around the coordinates stay part of the
# data instead of terminating the literal.
data = '[:T102639-3 coord="186,15,224,18":]L.[:/T102639-3:]'
res = re.search(pattern, data)
# res.group(1).split()[0] then gives 'T102639-3'
# res.group(1).split()[1] gives coord="186,15,224,18"
# res.group(2) gives 'L.'

Related

Python doesn't recognize two equal string

I have a strange problem with strings in Python. I have two lists and I have to find the names that appear in both of them. The second list is read line by line from a file opened earlier.
This is my code:
import requests
import sys
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen
###VARIABILI###
###FUNZIONI###
def get_mysellers():
    """Return every line of sellers.txt as a list, exactly as read.

    The lines are not stripped, so each entry keeps its line terminator.
    """
    handle = open("sellers.txt" , 'r')
    return list(handle)
def get_onlinesellers():
    """Return the seller names found on the parsed product page.

    Reads the module-level ``soup`` and returns the ``.string`` of every
    ``<span class="d-flex has-content-centered mr-1">`` it contains.
    """
    # These spans form the table listing the sellers and their details.
    seller_spans = soup.findAll('span', {"class": "d-flex has-content-centered mr-1"})
    return [span.string for span in seller_spans]
def list_of_choosen(selleronline, sellercheck ):
    """Return the entries of sellercheck that equal an entry of selleronline.

    Every (online, wanted) pair is compared with ``==``; a match contributes
    the sellercheck entry, so duplicates yield repeated entries. The result
    is ordered by selleronline first, then by sellercheck.
    """
    print(range(len(sellercheck)))
    return [
        wanted
        for online in selleronline
        for wanted in sellercheck
        if online == wanted
    ]
###MAIN###
# Download the product page and parse it; get_onlinesellers() reads this
# module-level soup.
page = urlopen("https://www.cardmarket.com/it/YuGiOh/Products/Singles/Chaos-Impact/Draco-Berserker-of-the-Tenyi")
soup = bs(page, 'html.parser')
# Names scraped from the page and names read from sellers.txt.
online_sellers = get_onlinesellers()
sellers = get_mysellers()
# Keep the names present in both lists and show them.
chosen = list_of_choosen(online_sellers, sellers)
print(chosen)
sellers is like this ['L-Air1993\n', 'prova \n', 'CardsellerVienna\n', 'Terrycloth\n']. I think the problem is "\n" but if I print a single element in sellers I obtain the name without "\n"
Thank you very much
When you print 'L-Air1993\n' it will look like it has just the name. The newline just adds a newline at the end but it's hard to see if nothing comes after it.
To remove all the new lines from your list, try this:
sellers_no_newlines = [x.strip() for x in sellers]
And then compare the online sellers list to sellers_no_newlines. Hopefully that solves your problem.
You can use the method strip in the function get_mysellers to remove the '\n' from the string.
def get_mysellers():
    """Return the seller names stored in sellers.txt, one per line.

    The newline of each line is removed so the names compare equal to
    names that carry no line terminator.
    """
    # The with-block closes the file even if reading fails part-way.
    with open("sellers.txt" , 'r') as s:
        return [line.strip('\n') for line in s]

Problems with text data-cleaning in python

I am working on a program that crawls Internet articles. The program is started by entering the start and end pages of the website.
This program works in the following order.
web-crawling of articles information(title, sort, time, contents)
Remove special characters
Only nouns are extracted.
The problem seems to lie in the noun-extraction step of cleaning the article content. Everything works up to the stage before noun extraction.
The error message is as follows
ValueError: Length of passed values is 4, index implies 5
To solve this problem, I coded using a method of adding DataFrame append.
But it doesn't solve the problem.
I use the KoNLPy library (a Korean morpheme analyzer).
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
from konlpy.tag import Okt
from pandas import Series
# Ask for the first and last list page to crawl; both ends are included by
# the page loop (it runs while startpage < lastpage + 1).
i = input('Start page? : ')
k = input('End page? : ')
startpage = int(i)
lastpage = int(k)
# Page number shown in the per-page progress banner.
count = int(i)
# Definition of text cleaning function
def text_cleaning(text):
    """Return text with each run of characters outside ㄱ-ㅣ and 가-힣 replaced by one space."""
    return re.sub('[^ㄱ-ㅣ가-힣]+', ' ', text)
# Definition of nouns extraction function
def get_nouns(x):
    """Return the nouns Okt extracts from x, minus short words and stopwords.

    A noun is kept when it is longer than one character and is not in the
    module-level ``stopwords`` collection.
    """
    candidates = Okt().nouns(x)
    return [word for word in candidates if len(word) > 1 and word not in stopwords]
# dataframe formation
# Empty frame with one column per scraped field; one row is appended per article.
columns = ['Title', 'Sort', 'Datetime', 'Article']
news_info = pd.DataFrame(columns=columns)
# NOTE(review): idx is not referenced anywhere else in the visible code.
idx = 0
Web-site page loop
# Walk the article-list pages from startpage to lastpage (inclusive) and
# scrape title, section, timestamp and body of every linked article.
# NOTE(review): the original indentation was lost in this copy, so the nesting
# of the statements below has to be inferred from the surrounding text.
while startpage<lastpage + 1:
url = f'http://www.koscaj.com/news/articleList.html?page={startpage}&total=72698&box_idxno=&sc_section_code=S1N2&view_type=sm'
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')
links = soup.find_all(class_='list-titles')
print(f'-----{count}page result-----')
# Articles loop in the web-site page
for link in links:
news_url = "http://www.koscaj.com"+link.find('a')['href']
news_link = urllib.request.urlopen(news_url).read()
soup2 = BeautifulSoup(news_link, 'html.parser')
# an article's title
title = soup2.find('div', {'class':'article-head-title'})
if title:
title = soup2.find('div', {'class':'article-head-title'}).text
else:
title = ''
# an article's sort
sorts = soup2.find('nav', {'class':'article-head-nav auto-marbtm-10'})
# The section name is the third link of the navigation bar; a missing bar or
# too few links falls back to an empty string.
# NOTE(review): the bare except also hides unrelated errors.
try:
sorts2 = sorts.find_all('a')
sort = sorts2[2].text
except:
sort =''
# an article's time
date = soup2.find('div',{'class':'info-text'})
# The timestamp is the text of the element that contains the clock icon, with
# the "승인" label removed; any failure falls back to an empty string.
try:
datetime = date.find('i', {'class':'fa fa-clock-o fa-fw'}).parent.text.strip()
datetime = datetime.replace("승인", "")
except:
datetime = ''
# an article's content
article = soup2.find('div', {'id':'article-view-content-div'})
if article:
article = soup2.find('div', {'id':'article-view-content-div'}).text
# Drop line breaks and tabs, then the boilerplate strings listed below.
article = article.replace("\n", "")
article = article.replace("\r", "")
article = article.replace("\t", "")
# NOTE(review): the '#' in the address below is presumably an extraction
# artefact for '@' — confirm against the live page before relying on it.
article = article.replace("[전문건설신문] koscaj#kosca.or.kr", "")
article = article.replace("저작권자 © 대한전문건설신문 무단전재 및 재배포 금지", "")
article = article.replace("전문건설신문", "")
article = article.replace("다른기사 보기", "")
else:
article = ''
# Remove special characters
# These three statements clean whole columns of the accumulated news_info
# frame, not the title/sort/article values scraped just above.
news_info['Title'] = news_info['Title'].apply(lambda x: text_cleaning(x))
news_info['Sort'] = news_info['Sort'].apply(lambda x: text_cleaning(x))
news_info['Article'] = news_info['Article'].apply(lambda x: text_cleaning(x))
So far, the program works without any problems. But if you see the program error message, it is indicated that the operation is not working because the input value and index are different.
Text data cleaning for extraction nouns
# Dataframe for storing after crawling individual articles
row = [title, sort, datetime, article]
# NOTE(review): the row holds 4 values but its index is taken from
# news_info.columns. Once the 'Nouns' column assigned further down exists, the
# frame has 5 columns, which matches the reported
# "Length of passed values is 4, index implies 5" error on the next article.
series = pd.Series(row, index=news_info.columns)
# NOTE(review): DataFrame.append was removed in pandas 2.0 — confirm the
# installed version still provides it.
news_info = news_info.append(series, ignore_index=True)
# Load Korean stopword dictionary file
path = "C:/Users/이바울/Desktop/이바울/코딩파일/stopwords-ko.txt"
with open(path, encoding = 'utf-8') as f:
stopwords = f.readlines()
# One stopword per line; strip the surrounding whitespace of each entry.
stopwords = [x.strip() for x in stopwords]
# Adds the fifth column: the nouns extracted from each article body.
news_info['Nouns'] = news_info['Article'].apply(lambda x: get_nouns(x))
# Advance to the next list page and keep the banner counter in step.
startpage += 1
count += 1
# Export everything collected to an Excel workbook named after the page range.
news_info.to_excel(f'processing{lastpage-int(1)}-{startpage-int(1)}.xlsx')
print('Complete')
After setting the existing 4 columns in the Pandas DataFrame, the append was used to add the column extracted as a noun as the 5th column. I know this method adds a column regardless of the index name. And if you look at the image link at the bottom, as a result, the first article is crawled and shows the results. From the next article, it does not work and an error occurs.
enter image description here(Program error result)
enter link description here(Korean stopwords dictionary)
I solved the problem.
It came down to where one statement sits inside the for loop.
I found it by repositioning the suspect statements one at a time while leaving the code that already worked untouched.
The fix was to remove two levels of indentation from the line below:
news_info['Nouns'] = news_info['Article'].apply(lambda x: get_nouns(x))

Creating multiple text files with unique file names from scraped data

I took an introductory course in Python this semester and am now trying to do a project. However, I don't really know what code I should write to create multiple .txt files of which the title will be different for each file.
I scraped all the terms and definitions from the website http://www.hogwartsishere.com/library/book/99/. The title of each .txt file should be, for example, 'Aconite.txt', and the content of the file should be the term and its definition. Every term with its definition can be found in a separate p-tag, and the term itself is a b-tag within the p-tag. Can I use this to write my code?
I suppose I will need to use a for-loop for this, but I don't really know where to start. I searched StackOverflow and found several solutions, but all of them contain code I am not familiar with and/or relate to another issue.
This is what I have so far:
#!/usr/bin/env python
import requests
import bs4
def download(url):
    """Fetch one chapter page and return its "Term - definition" paragraphs.

    Parameters:
        url: address of the chapter page to download.

    Returns:
        A list with the text of every <p> inside a
        <div class="font-size-16 roboto"> that contains a free-standing dash.
    """
    r = requests.get(url)
    html = r.text
    soup = bs4.BeautifulSoup(html, 'html.parser')
    terms_definition = []
    items = soup.find_all("div", {"class" : "font-size-16 roboto"})
    for item in items:
        for term in item.find_all("p"):
            text = term.text
            term_split = text.split()
            print(term_split)
            # A glossary entry has a dash standing on its own between the
            # term and its definition; other paragraphs are skipped.
            if len(text) > 1 and '-' in term_split:
                print(text)
                print('\n')
                # Collect the paragraph text itself; indexing the tag with
                # ['p'] would look up an HTML attribute named "p".
                terms_definition.append(text)
    print(terms_definition)
    return terms_definition
def create_url(start, end):
    """Return the chapter URLs for chapter numbers start up to, but excluding, end."""
    base_url = 'http://www.hogwartsishere.com/library/book/99/chapter/'
    return [base_url + str(chapter) for chapter in range(start, end)]
def search_all_url(list_url):
    """Call download() on each URL of list_url, in order; returns nothing."""
    for chapter_url in list_url:
        download(chapter_url)
#write data into separate text files. Word in front of the dash should be title of the document, term and definition should be content of the text file
#all terms and definitions are in separate p-tags, title is a b-tag within the p-tag
def name_term(paragraph):
    """Return the term of a "Term - definition" paragraph.

    The term is the text in front of the first " - ". An empty string is
    returned when the paragraph contains no such separator.
    """
    term, separator, _ = paragraph.partition(' - ')
    if not separator:
        return ''
    # A slash in the term would be read as a directory separator in a path.
    return term.strip().replace('/', '-')


def text_files(terms_definition):
    """Write every "Term - definition" paragraph to data/<term>.txt.

    Parameters:
        terms_definition: iterable of paragraph strings.

    The file content is the whole paragraph (term and definition).
    Paragraphs without a " - " separator are skipped because they provide
    no file name.
    """
    import os  # local import: the script's import block does not provide os

    os.makedirs('data', exist_ok=True)
    for paragraph in terms_definition:
        term = name_term(paragraph)
        if not term:
            continue
        path_write = os.path.join('data', term + '.txt')
        with open(path_write, 'w', encoding='utf-8') as f:
            f.write(paragraph)
#for loop? in front of dash = title / everything before and after dash = text (file content) / empty line = new file
# Entry point: only the first chapter is downloaded; the crawl over the URL
# list is left commented out below.
if __name__ == '__main__':
download('http://www.hogwartsishere.com/library/book/99/chapter/1')
#list_url = create_url(1, 27)
#search_all_url(list_url)
Thanks in advance!
You can iterate over all pages (1-27) to get its content, then parse each page with bs4 and then save results to files:
import requests
import bs4
import re

# The accompanying text speaks of pages 1-27; range() excludes its end value,
# so the upper bound has to be 28 for page 27 to be fetched.
for i in range(1, 28):
    r = requests.get('http://www.hogwartsishere.com/library/book/99/chapter/{}/'.format(i)).text
    soup = bs4.BeautifulSoup(r, 'html.parser')
    items = soup.find_all("div", {"class": "font-size-16 roboto"})
    for item in items:
        terms = item.find_all("p")
        for term in terms:
            # Only paragraphs containing " -" provide a title; re.match
            # returns None for all others, so they are skipped.
            match = re.match('^(.*) -', term.text)
            if match is None:
                continue
            # '/' would be read as a directory separator in the file name.
            title = match.group(1).replace('/', '-')
            with open(title + '.txt', 'w', encoding='utf-8') as f:
                f.write(term.text)
Output files:

How to go through all items and than save them in a dictionary key

I want to load automatically a code from website.
I have a list of names and want to go through every item: take the first item, make a request, open the website, copy the code/number from the HTML (the text in a span), then save this result in a dictionary, and so on for all items.
I read from csv all lines and save them into a list.
After this I make request to load HTML from a website, search the company and read the numbers from span.
My code:
# Read all CSV rows into a list and show them.
with open(test_f, 'r') as file:
rows = csv.reader(file,
delimiter=',',
quotechar='"')
data = [data for data in rows]
print(data)
# The search URL is assembled from a fixed prefix, the company name and a
# fixed suffix.
url_part1 = "http://www.monetas.ch/htm/651/de/Firmen-Suchresultate.htm?Firmensuche="
# NOTE(review): rows was already iterated by the comprehension above, so this
# second pass presumably yields nothing; the result is a list either way.
url_enter_company = [data for data in rows]
url_last_part = "&CompanySearchSubmit=1"
# NOTE(review): firma_noga is empty, so the loop below never executes.
firma_noga = []
for data in firma_noga:
# NOTE(review): url_enter_company is a list; concatenating it with the str
# parts raises TypeError. The loop variable data is not used here.
search_noga = url_part1 + url_enter_company + url_last_part
r = requests.get(search_noga)
soup = BeautifulSoup(r.content, 'html.parser')
lii = soup.find_all("span")
# print all numbers that are in a span
# NOTE(review): numbers is built but only the heading is printed.
numbers = [d.text for d in lii]
print("NOGA Codes: ")
I want to get in dictionary the result, where the key should be the company name (item in a list) and the value should be the number that I read from the span:
dict = {"firma1": "620100", "firma2": "262000, 465101"}
Can some one help me, I am new at web scraping and python, and don't know what I am doing wrong.
Split your string with a regex and handle each part depending on whether it is a number or not:
import re

# Split on runs of digits; the capturing group keeps those runs in the result,
# so numbers and the text between them alternate.
for partial in re.split(r'([0-9]+)', myString):
    try:
        print(int(partial))
    except ValueError:
        # int() raises ValueError for the non-numeric pieces (including ''),
        # so only that exception is handled.
        print(partial + ' is not a number')
EDIT:
Well, myString is somewhat expected to be a string.
To get the text content of your spans as a string you should be able to use .text something like this:
spans = soup.find_all('span')
for span in spans:
    # .text gives the text content of the span as a plain string.
    myString = span.text
    # Split on runs of digits; the capturing group keeps those runs in the
    # result, so numbers and the text between them alternate.
    for partial in re.split(r'([0-9]+)', myString):
        try:
            print(int(partial))
        except ValueError:
            # int() raises ValueError for the non-numeric pieces.
            print(partial + ' is not a number')
Abstracting from my requirements in the comments, I think something like this should work for you:
firma_noga = ['firma1', 'firma2', 'firma3'] #NOT EMPTY as in your code!
# Maps each company name to the list of span texts found on its result page.
res_dict = {}
for data in firma_noga:
    # Put the current company name into the search URL.
    search_noga = url_part1 + data + url_last_part
    r = requests.get(search_noga)
    soup = BeautifulSoup(r.content, 'html.parser')
    lii = soup.find_all("span")
    for l in lii:
        # Store the text of the span rather than the tag object, so the
        # values are plain strings as in the desired result.
        res_dict.setdefault(data, []).append(l.text)
Obviously, this only works if firma_noga is not empty, unlike in your code; the rest of your parsing logic has to be valid as well.

Python HTML parsing script that takes array of URLs and outputs specific data about each of the URLs

I am trying to write an HTML parser in Python that takes as its input a URL or list of URLs and outputs specific data about each of those URLs in the format:
URL: data1: data2
The data points can be found at the exact same HTML node in each of the URLs. They are consistently between the same starting tags and ending tags. If anyone out there would like to help an amateur python programmer get the job done, it would be greatly appreciated. Extra points if you can come up with a way to output the information that can be easily copied and pasted into an excel document for subsequent data analysis!
For example, lets say I would like to output the view count for a particular YouTube video. For the URL http://www.youtube.com/watch?v=QOdW1OuZ1U0, the view count is around 3.6 million. For all YouTube videos, this number is found in the following format within the page's source:
<span class="watch-view-count ">
3,595,057
</span>
Fortunately, these exact tags are found only once on a particular YouTube video's page. These starting and ending tags can be inputted into the program or built-in and modified when necessary. The output of the program would be:
http://www.youtube.com/watch?v=QOdW1OuZ1U0: 3,595,057 (or 3595057).
import urllib2
from bs4 import BeautifulSoup

url = 'http://www.youtube.com/watch?v=QOdW1OuZ1U0'
f = urllib2.urlopen(url)
data = f.read()
soup = BeautifulSoup(data)
span = soup.find('span', attrs={'class':'watch-view-count'})
# The count sits on its own line inside the span, so the surrounding
# whitespace is trimmed to get the single-line "URL:count" output.
print('{}:{}'.format(url, span.text.strip()))
If you do not want to use BeautifulSoup, you can use re:
import urllib2
import re

url = 'http://www.youtube.com/watch?v=QOdW1OuZ1U0'
f = urllib2.urlopen(url)
data = f.read()
# Raw string so that \d reaches the regex engine unchanged. DOTALL lets '.'
# span the line breaks between the tags and the number.
pattern = re.compile(r'<span class="watch-view-count.*?([\d,]+).*?</span>', re.DOTALL)
r = pattern.search(data)
# search() returns None when the span is missing; print an empty count then
# instead of failing on r.group.
print('{}:{}'.format(url, r.group(1) if r else ''))
As for the outputs, I think you can store them in a csv file.
I prefer HTMLParser over re for this type of task. However, HTMLParser can be a bit tricky. I use mutable module-level objects (lists) to store the data... I'm sure this is the wrong way of doing it, but it has worked for me on several projects in the past.
import urllib2
from HTMLParser import HTMLParser
import csv
# Shared parser state: position holds 'bingo' while the parser is inside a
# matching span; results[0] accumulates the text found there.
position = []
results = [""]


class hp(HTMLParser):
    """Collect the text inside <span> tags that carry the watch-view-count class.

    The state is kept in the module-level lists ``position`` and ``results``
    rather than on the instance.
    """

    def handle_starttag(self, tag, attrs):
        """Mark the start of a matching span."""
        if tag != 'span':
            return
        # The class attribute may hold several names or stray whitespace, so
        # compare its tokens instead of the raw attribute string.
        class_names = (dict(attrs).get('class') or '').split()
        if 'watch-view-count' in class_names:
            position.append('bingo')

    def handle_endtag(self, tag):
        """Leave capture mode at the next closing span tag."""
        if tag == 'span' and 'bingo' in position:
            position.remove('bingo')

    def handle_data(self, data):
        """Append text to results[0] while inside a matching span."""
        if 'bingo' in position:
            results[0] += " " + data.strip() + " "
my_pages = ["http://www.youtube.com/watch?v=QOdW1OuZ1U0"]
data = []
for url in my_pages:
    response = urllib2.urlopen(url)
    page = str(response.read())
    parser = hp()
    parser.feed(page)
    data.append(results[0])
    # Reset the shared state in place before the next page, so every
    # reference to these lists sees the cleared values.
    position[:] = []
    results[0] = ""

with open('/path/to/test.csv', 'wb') as f:
    writer = csv.writer(f)
    header = ['url', 'output']
    writer.writerow(header)
    # data holds one entry per page, in the same order as my_pages.
    for url, output in zip(my_pages, data):
        writer.writerow([url, output])
Then just open /path/to/test.csv in Excel

Categories