i have a strange problem with String in python. I have two list and I have to found equal name in the two strings. The second list is a readline() from a file opened before.
This is my code:
import requests
import sys
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen
###VARIABILI###
###FUNZIONI###
def get_mysellers():
s = open("sellers.txt" , 'r')
sellers = []
for line in s:
sellers.append(line)
return sellers
def get_onlinesellers():
online_sellers = []
table_body = soup.findAll('span', {"class": "d-flex has-content-centered mr-1"}) #questa è la tabella contenente la lista dei venditori con annesse informazioni
i = 0
for child in table_body:
#print(child.string)
online_sellers.append(child.string)
i = i + 1
return online_sellers
def list_of_choosen(selleronline, sellercheck ):
choosen = []
print(range(len(sellercheck)))
for i in range(len(selleronline)):
for j in range(len(sellercheck)):
if(selleronline[i] == sellercheck[j]):
choosen.append(sellercheck[j])
return choosen
###MAIN###
page = urlopen("https://www.cardmarket.com/it/YuGiOh/Products/Singles/Chaos-Impact/Draco-Berserker-of-the-Tenyi")
soup = bs(page, 'html.parser')
online_sellers = get_onlinesellers()
sellers = get_mysellers()
chosen = list_of_choosen(online_sellers, sellers)
print(chosen)
sellers is like this ['L-Air1993\n', 'prova \n', 'CardsellerVienna\n', 'Terrycloth\n']. I think the problem is "\n" but if I print a single element in sellers I obtain the name without "\n"
Thank you very much
When you print 'L-Air1993\n' it will look like it has just the name. The newline just adds a newline at the end but it's hard to see if nothing comes after it.
To remove all the new lines from your list, try this:
sellers_no_newlines = [x.strip() for x in sellers]
And then compare the online sellers list to sellers_no_newlines. Hopefully that solves your problem.
You can use the method strip in the function get_mysellers to remove the '\n' from the string.
def get_mysellers():
s = open("sellers.txt" , 'r')
sellers = []
for line in s:
Line=line.strip('\n')
sellers.append(Line)
Related
I am working on a program that crawls Internet articles using the web crawling method.The program is started by entering the start and end pages of the website.
This program works in the following order.
web-crawling of articles information(title, sort, time, contents)
Remove special characters
Only nouns are extracted.
The problem maybe occurs lies in extracting nouns in the process of cleaning the content of the article. It works until the stage before noun extraction.
The error message is as follows
ValueError: Length of passed values is 4, index implies 5
To solve this problem, I coded using a method of adding DataFrame append.
But it doesn't solve the problem.
Use konlypy method(Korean morpheme analyzer)
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
from konlpy.tag import Okt
from pandas import Series
i = input('Start page? : ')
k = input('End page? : ')
startpage = int(i)
lastpage = int(k)
count = int(i)
# Definition of text cleaning function
def text_cleaning(text):
hangul = re.compile('[^ㄱ-ㅣ가-힣]+')
result = hangul.sub(' ', text)
return result
# Definition of nouns extraction function
def get_nouns(x):
nouns_tagger = Okt()
nouns = nouns_tagger.nouns(x)
nouns = [noun for noun in nouns if len(noun)>1]
nouns = [noun for noun in nouns if noun not in stopwords]
return nouns
# dataframe formation
columns = ['Title', 'Sort', 'Datetime', 'Article']
news_info = pd.DataFrame(columns=columns)
idx = 0
Web-site page loop
while startpage<lastpage + 1:
url = f'http://www.koscaj.com/news/articleList.html?page={startpage}&total=72698&box_idxno=&sc_section_code=S1N2&view_type=sm'
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')
links = soup.find_all(class_='list-titles')
print(f'-----{count}page result-----')
# Articles loop in the web-site page
for link in links:
news_url = "http://www.koscaj.com"+link.find('a')['href']
news_link = urllib.request.urlopen(news_url).read()
soup2 = BeautifulSoup(news_link, 'html.parser')
# an article's title
title = soup2.find('div', {'class':'article-head-title'})
if title:
title = soup2.find('div', {'class':'article-head-title'}).text
else:
title = ''
# an article's sort
sorts = soup2.find('nav', {'class':'article-head-nav auto-marbtm-10'})
try:
sorts2 = sorts.find_all('a')
sort = sorts2[2].text
except:
sort =''
# an article's time
date = soup2.find('div',{'class':'info-text'})
try:
datetime = date.find('i', {'class':'fa fa-clock-o fa-fw'}).parent.text.strip()
datetime = datetime.replace("승인", "")
except:
datetime = ''
# an article's content
article = soup2.find('div', {'id':'article-view-content-div'})
if article:
article = soup2.find('div', {'id':'article-view-content-div'}).text
article = article.replace("\n", "")
article = article.replace("\r", "")
article = article.replace("\t", "")
article = article.replace("[전문건설신문] koscaj#kosca.or.kr", "")
article = article.replace("저작권자 © 대한전문건설신문 무단전재 및 재배포 금지", "")
article = article.replace("전문건설신문", "")
article = article.replace("다른기사 보기", "")
else:
article = ''
# Remove special characters
news_info['Title'] = news_info['Title'].apply(lambda x: text_cleaning(x))
news_info['Sort'] = news_info['Sort'].apply(lambda x: text_cleaning(x))
news_info['Article'] = news_info['Article'].apply(lambda x: text_cleaning(x))
So far, the program works without any problems. But if you see the program error message, it is indicated that the operation is not working because the input value and index are different.
Text data cleaning for extraction nouns
# Dataframe for storing after crawling individual articles
row = [title, sort, datetime, article]
series = pd.Series(row, index=news_info.columns)
news_info = news_info.append(series, ignore_index=True)
# Load Korean stopword dictionary file
path = "C:/Users/이바울/Desktop/이바울/코딩파일/stopwords-ko.txt"
with open(path, encoding = 'utf-8') as f:
stopwords = f.readlines()
stopwords = [x.strip() for x in stopwords]
news_info['Nouns'] = news_info['Article'].apply(lambda x: get_nouns(x))
startpage += 1
count += 1
news_info.to_excel(f'processing{lastpage-int(1)}-{startpage-int(1)}.xlsx')
print('Complete')
After setting the existing 4 columns in the Pandas DataFrame, the append was used to add the column extracted as a noun as the 5th column. I know this method adds a column regardless of the index name. And if you look at the image link at the bottom, as a result, the first article is crawled and shows the results. From the next article, it does not work and an error occurs.
enter image description here(Program error result)
enter link description here(Korean stopwords dictionary)
I solves the problem.
It depends on the location of the code in the for loop statement.
I've been able to fix the problem as a result of continuing to reposition the problematic areas except for the code that worked before.
I solved the problem by applying backspace only twice in the code below.
news_info['Nouns'] = news_info['Article'].apply(lambda x: get_nouns(x))
I took an introductory course in Python this semester and am now trying to do a project. However, I don't really know what code I should write to create multiple .txt files of which the title will be different for each file.
I scraped all the terms and definitions from the website http://www.hogwartsishere.com/library/book/99/. Title of the .txt file should for example be 'Aconite.txt' and the content of the file should be the title and the definition. Every term with its definition can be found in a separate p-tag and the term itself is a b-tag withing the p-tag. Can I use this to write my code?
I suppose I will need to use a for-loop for this, but I don't really know where to start. I searched StackOverflow and found several solutions, but all of them contain code I am not familiar with and/or relate to another issue.
This is what I have so far:
#!/usr/bin/env/ python
import requests
import bs4
def download(url):
r = requests.get(url)
html = r.text
soup = bs4.BeautifulSoup(html, 'html.parser')
terms_definition = []
#for item in soup.find_all('p'): #beter definiëren
items = soup.find_all("div", {"class" : "font-size-16 roboto"})
for item in items:
terms = item.find_all("p")
for term in terms:
#print(term)
if term.text is not 'None':
#print(term.text)
#print("\n")
term_split = term.text.split()
print(term_split)
if term.text != None and len(term.text) > 1:
if '-' in term.text.split():
print(term.text)
print('\n')
if item.find('p'):
terms_definition.append(item['p'])
print(terms_definition)
return terms_definition
def create_url(start, end):
list_url = []
base_url = 'http://www.hogwartsishere.com/library/book/99/chapter/'
for x in range(start, end):
list_url.append(base_url + str(x))
return list_url
def search_all_url(list_url):
for url in list_url:
download(url)
#write data into separate text files. Word in front of the dash should be title of the document, term and definition should be content of the text file
#all terms and definitions are in separate p-tags, title is a b-tag within the p-tag
def name_term
def text_files
path_write = os.path.join('data', name_term +'.txt') #'term' should be replaced by the scraped terms
with open(path_write, 'w') as f:
f.write()
#for loop? in front of dash = title / everything before and after dash = text (file content) / empty line = new file
if __name__ == '__main__':
download('http://www.hogwartsishere.com/library/book/99/chapter/1')
#list_url = create_url(1, 27)
#search_all_url(list_url)
Thanks in advance!
You can iterate over all pages (1-27) to get its content, then parse each page with bs4 and then save results to files:
import requests
import bs4
import re
for i in range(1, 27):
r = requests.get('http://www.hogwartsishere.com/library/book/99/chapter/{}/'.format(i)).text
soup = bs4.BeautifulSoup(r, 'html.parser')
items = soup.find_all("div", {"class": "font-size-16 roboto"})
for item in items:
terms = item.find_all("p")
for term in terms:
title = re.match('^(.*) -', term.text).group(1).replace('/', '-')
with open(title + '.txt', 'w', encoding='utf-8') as f:
f.write(term.text)
Output files:
Looking to return a full string after if there is a long dash ("―"), and if true, return everything before the first comma (","). How would I do this using Python with Regex?
from bs4 import BeautifulSoup
import requests
import json
import pandas as pd
request = requests.get('https://www.goodreads.com/quotes/tag/fun?page=1')
soup = BeautifulSoup(request.text, 'lxml')
# for loop
s = soup.find_all("div", class_="quoteText")[0].text
s = " ".join(s.split())
s[:s.index(",")]
s
Raw Output:
“That does it," said Jace. "I\'m going to get you a dictionary for Christmas this year.""Why?" Isabelle said."So you can look up \'fun.\' I\'m not sure you know what it means.” ― Cassandra Clare, City of Ashes //<![CDATA[ function submitShelfLink(unique_id, book_id, shelf_id, shelf_name, submit_form, exclusive) { var checkbox_id = \'shelf_name_\' + unique_id + \'_\' + shelf_id; var element = document.getElementById(checkbox_id) var checked = element.checked if (checked && exclusive) { // can\'t uncheck a radio by clicking it! return } if(document.getElementById("savingMessage")){ Element.show(\'savingMessage\') } var element_id = \'shelfInDropdownName_\' + unique_id + \'_\' + shelf_id; Element.upda
Desired Output:
“That does it," said Jace. "I\'m going to get you a dictionary for Christmas this year.""Why?" Isabelle said."So you can look up \'fun.\' I\'m not sure you know what it means.” ― Cassandra Clare
Here's one solution:
import re
s = 'adflakjd, fkljlkjdf ― Cassandra Clare, City of Ash, adflak'
x = re.findall('.*―.*?(?=,)', s)
print x
['adflakjd, fkljlkjdf ― Cassandra Clare']
I'm not sure I understand it properly, but I think you mean:
example_string = "part to return,example__text"
if example_string.count('__') > 0:
try:
result = re.search('(.*?)\,', example_string).group(0)
except:
result = None
print(result)
This prints 'part to return'
If you mean, the part of the string between the '__' and the ',' I would use:
example_string = "lala__part to return, lala"
try:
result = re.search('__(.*?)\,', example_string).group(0)
except:
result = None
print(result)
from bs4 import BeautifulSoup
from bs4.element import NavigableString
import requests
request = requests.get('https://www.goodreads.com/quotes/tag/fun?page=1')
soup = BeautifulSoup(request.text, 'html.parser')
# for loop
s = soup.find_all("div", class_="quoteText")[0]
text = ''
text += "".join([t.strip() for t in s.contents if type(t) == NavigableString])
for book_or_author_tag in s.find_all("a", class_ = "authorOrTitle"):
text += "\n" + book_or_author_tag.text.strip()
print(text)
The quote you want is split across the initial quoteText div, but calling text on it returns all that CDATA junk you're trying to remove with the regex.
By looping over every child of that div and checking whether it's a navigable string type, we can extract only the actual text data you want. then tack on the author and book, and hopefully your regex becomes a lot simpler.
I want to load automatically a code from website.
I have a list with some names and want to go through every item. Go through the first item, make request, open website, copy the code/number from HTML (text in span) and than save this result in dictionary and so on (for all items).
I read from csv all lines and save them into a list.
After this I make request to load HTML from a website, search the company and read the numbers from span.
My code:
with open(test_f, 'r') as file:
rows = csv.reader(file,
delimiter=',',
quotechar='"')
data = [data for data in rows]
print(data)
url_part1 = "http://www.monetas.ch/htm/651/de/Firmen-Suchresultate.htm?Firmensuche="
url_enter_company = [data for data in rows]
url_last_part = "&CompanySearchSubmit=1"
firma_noga = []
for data in firma_noga:
search_noga = url_part1 + url_enter_company + url_last_part
r = requests.get(search_noga)
soup = BeautifulSoup(r.content, 'html.parser')
lii = soup.find_all("span")
# print all numbers that are in a span
numbers = [d.text for d in lii]
print("NOGA Codes: ")
I want to get in dictionary the result, where the key should be the company name (item in a list) and the value should be the number that I read from the span:
dict = {"firma1": "620100", "firma2": "262000, 465101"}
Can some one help me, I am new at web scraping and python, and don't know what I am doing wrong.
Split your string with regex and do your stuff depending on wether it is a number or not:
import re
for partial in re.split('([0-9]+)', myString):
try:
print(int(partial))
except:
print(partial + ' is not a number')
EDIT:
Well, myString is somewhat expected to be a string.
To get the text content of your spans as a string you should be able to use .text something like this:
spans = soup.find_all('span')
for span in spans:
myString = span.text #
for partial in re.split('([0-9]+)', myString):
try:
print(int(partial))
except:
print(partial + ' is not a number')
Abstracting from my requirements in comments I think somethinfg like this should work for you:
firma_noga = ['firma1', 'firma2', 'firma3'] #NOT EMPTY as in your code!
res_dict = {}
for data in firma_noga:
search_noga = url_part1 + url_enter_company + url_last_part
r = requests.get(search_noga)
soup = BeautifulSoup(r.content, 'html.parser')
lii = soup.find_all("span")
for l in lii:
if data not in res_dict:
res_dict[data] = [l]
else:
res_dict[data].append(l)
Obviously this will work obviously if firma-noga won't be empty like in your code; and all the rest (your) parsing logic should be valid as well.
how can I extract the information of the appended html and save in a text file the following:
Paragraph-ID \t TokenID \t TokenCoordinates \t TokenContent
So, for example, the first lines should look like this:
T102633 1 109,18,110,18 IV
T102634 1 527,29,139,16 Seit
...
I'd like to use python. At the moment, I have the following:
root = lxml.html.parse('html-file').getroot()
tables = root.cssselect('table.main')
tables = root.xpath('//table[#class="main" and not(ancestor::table[#class="main"])]')
for elem in root.xpath("//span[#class='finereader']"):
text = (elem.text or "") + (elem.tail or "")
if elem.getprevious() is not None: # If there's a previous node
previous = elem.getprevious()
previous.tail = (previous.tail or "") + text # append to its tail
else:
parent = elem.getparent() # Otherwise use the parent
parent.text = (parent.text or "") + text # and append to its text
elem.getparent().remove(elem)
txt = []
txt += ([lxml.etree.tostring(t, method="html", encoding="utf-8") for t in tables])
text = "\n".join(el for el in txt)
output.write(text.decode("utf-8"))
This gives me something like this:
[:T102633-1
coord="109,18,110,18":]IV[:/T102633-1:]
Now, it's clear that I could use the string-find-method to extract the information I want. But is there no more elegant solution? With ".attrib" or something like that?
Thanks for any help!
Here, one can find the html: http://tinyurl.com/qjvsp4n
This code using BeautifulSoup gives all the spans you are interested in:
from bs4 import BeautifulSoup
html_file = open('html_file')
soup = BeautifulSoup(html_file)
table = soup.find('table', attrs={'class':'main'})
# The first two tr's dont seem to contain the info you need,
# so get rid of them
rows = table.find_all('tr')[2:]
for row in rows:
data = row.find_all('td')[1]
span_element = data.find_all('span')
for ele in span_element:
print ele.text
Once you have the data in the format [:T102639-3 coord="186,15,224,18":]L.[:/T102639-3:], use the python regex module to get the content.
import re
pattern = re.compile('\[:(.*):\](.*)\[:\/(.*):\]')
data = "[:T102639-3 coord="186,15,224,18":]L.[:/T102639-3:]"
res = re.search(pattern, data)
# res.group(1).split()[0] then gives 'T102639-3'
# res.group(1).split()[1] gives coord="186,15,224,18"
# res.group(2) gives 'L.'