Closed. This question needs debugging details. It is not currently accepting answers.
Edit the question to include desired behavior, a specific problem or error, and the shortest code necessary to reproduce the problem. This will help others answer the question.
Closed 3 years ago.
Improve this question
I am writing a function to get the length of words and count them. Inside this function, I have tried to get intersection of two dictionary but I couldn't reach one of them in properly.
My problem is in line 37 (before the last line), I couldn't reach di_Title which is in line 13 di_Title = dic()
I have tried global variable but it didn't work for me!
My function:
def Text_Analyze(Raw_Text):
Title_Length = []
Title_Dictionary = []
Article_Length = []
Article_Dictionary = []
Intersection_Dictionary = []
Title_Info = Raw_Text.PageTitle
for Each_Line in Title_Info:
Title = remove_punctuation(Each_Line)
Title = Title.lower()
Title = Title.split()
Title_Length.append(len(Title))
di_Title = dict()
for w_Title in Title:
Root_Title_Split = TurkishStemmer()
Root_Title_Word = Root_Title_Split.stem(w_Title)
if Root_Title_Word in di_Title:
di_Title[Root_Title_Word] = di_Title[Root_Title_Word] + 1
else:
di_Title[Root_Title_Word] = 1
Title_Dictionary.append(di_Title)
Article_Info = Raw_Text.PageArticle
for each_Line in Article_Info:
Article = remove_punctuation(each_Line)
Article = Article.lower()
Article = Article.split()
Article_Length.append(len(Article))
di_Article = dict()
for w_Article in Article:
root_Article_Split = TurkishStemmer()
root_Article_Word = root_Article_Split.stem(w_Article)
if root_Article_Word in di_Article:
di_Article[root_Article_Word] = di_Article[root_Article_Word] + 1
else:
di_Article[root_Article_Word] = 1
Article_Dictionary.append(di_Article)
Int_Word_Dic = intersect(di_Title, di_Article)
Intersection_Dictionary.append(Int_Word_Dic)
Variables declared inside a scope (in your case, in a loop) will be inaccessible outside the scope in which it is declared.
You can declare it at a higher scope (in your case, before the loop) and change its value depending on your needs.
I dont get your question but this might do it, as long as each title_info row matches with the same row in article_info
def Text_Analyze(Raw_Text):
Title_Length = []
Title_Dictionary = []
Article_Length = []
Article_Dictionary = []
Intersection_Dictionary = []
Title_Info = Raw_Text.PageTitle
for Each_Line in Title_Info:
Title = remove_punctuation(Each_Line)
Title = Title.lower()
Title = Title.split()
Title_Length.append(len(Title))
di_Title = dict()
for w_Title in Title:
Root_Title_Split = TurkishStemmer()
Root_Title_Word = Root_Title_Split.stem(w_Title)
if Root_Title_Word in di_Title:
di_Title[Root_Title_Word] = di_Title[Root_Title_Word] + 1
else:
di_Title[Root_Title_Word] = 1
Title_Dictionary.append(di_Title)
Article_Info = Raw_Text.PageArticle
for c,each_Line in enumerate(Article_Info):
Article = remove_punctuation(each_Line)
Article = Article.lower()
Article = Article.split()
Article_Length.append(len(Article))
di_Article = dict()
for w_Article in Article:
root_Article_Split = TurkishStemmer()
root_Article_Word = root_Article_Split.stem(w_Article)
if root_Article_Word in di_Article:
di_Article[root_Article_Word] = di_Article[root_Article_Word] + 1
else:
di_Article[root_Article_Word] = 1
Article_Dictionary.append(di_Article)
Int_Word_Dic = intersect(Title_Dictionary[c], di_Article)
Intersection_Dictionary.append(Int_Word_Dic)
Related
I am trying to scrape all the possible data from this webpage Gstaad 2017
Here is my code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from selenium.webdriver.support.ui import Select
#Starts the driver and goes to our starting webpage
driver = webdriver.Chrome( "C:/Users/aldi/Downloads/chromedriver.exe")
driver.get('http://www.bvbinfo.com/Tournament.asp?ID=3294&Process=Matches')
#Imports HTML into python
page = requests.get('http://www.bvbinfo.com/Tournament.asp?ID=3294&Process=Matches')
soup = BeautifulSoup(driver.page_source, 'lxml')
stages = soup.find_all('div')
stages = driver.find_elements_by_class_name('clsTournBracketHeader')[-1].text
#TODO the first row (country quota matches) has no p tag and therefore it is not included in the data
rows = []
paragraphs = []
empty_paragraphs = []
for x in soup.find_all('p'):
if len(x.get_text(strip=True)) != 0:
paragraph = x.extract()
paragraphs.append(paragraph)
if len(x.get_text(strip=True)) == 0:
empty_paragraph = x.extract()
empty_paragraphs.append(empty_paragraph)
# players
home_team_player_1 = ''
home_team_player_2 = ''
away_team_player_1 = ''
away_team_player_2 = ''
for i in range(0, len(paragraphs)):
#round and satege of the competition
round_n= paragraphs[i].find('u').text
paragraph_rows = paragraphs[i].text.split('\n')[1:-1]
counter = 0
for j in range(0,len(paragraph_rows)):
#TODO tournament info, these can vary from tournament to tournament
tournament_info = soup.find('td', class_ = 'clsTournHeader').text.strip().split()
tournament_category = [' '.join(tournament_info[0 : 2])][0]
tournament_prize_money = tournament_info[2]
#TODO tournament city can also have two elements, not just one
tournament_city = tournament_info[3]
tournament_year = tournament_info[-1]
tournament_days = tournament_info[-2][:-1].split("-")
tournament_starting_day = tournament_days[0]
tournament_ending_day = tournament_days[-1]
tournament_month = tournament_info[-3]
tournament_stars = [' '.join(tournament_info[5 : 7])][0]
players = paragraphs[i].find_all('a', {'href':re.compile('.*player.*')})
home_team_player_1 = players[counter+0].text
home_team_player_2 = players[counter+1].text
away_team_player_1 = players[counter+2].text
away_team_player_2 = players[counter+3].text
#matches
match= paragraph_rows[j].split(":")[0].split()[-1].strip()
#nationalities
nationalities = ["United", "States"]
if paragraph_rows[j].split("def.")[0].split("/")[1].split("(")[0].split(" ")[3] in nationalities:
home_team_country = "United States"
else:
home_team_country = paragraph_rows[j].split("def.")[0].split("/")[1].split("(")[0].split(" ")[-2]
if paragraph_rows[j].split("def.")[1].split("/")[1].split(" ")[3] in nationalities:
away_team_country = "United States"
else:
away_team_country = paragraph_rows[j].split("def.")[1].split("/")[1].split("(")[0].split(" ")[-2]
parentheses = re.findall(r'\(.*?\)', paragraph_rows[j])
if "," in parentheses[0]:
home_team_ranking = parentheses[0].split(",")[0]
home_team_ranking = home_team_ranking[1:-1]
home_team_qualification_round = parentheses[0].split(",")[1]
home_team_qualification_round = home_team_qualification_round[1:-1]
else:
home_team_ranking = parentheses[0].split(",")[0]
home_team_ranking = home_team_ranking[1:-1]
home_team_qualification_round = None
if "," in parentheses[1]:
away_team_ranking = parentheses[1].split(",")[0]
away_team_ranking = away_team_ranking[1:-1]
away_team_qualification_round = parentheses[1].split(",")[1]
away_team_qualification_round = away_team_qualification_round[1:-1]
else:
away_team_ranking = parentheses[1].split(",")[0]
away_team_ranking = away_team_ranking[1:-1]
match_duration = parentheses[2]
match_duration = match_duration[1:-1]
away_team_qualification_round = None
# sets
sets = re.findall(r'\).*?\(', paragraph_rows[j])
sets = sets[1][1:-1]
if len(sets.split(",")) == 2:
score_set1 = sets.split(",")[0]
score_set2 = sets.split(",")[1]
score_set3 = None
if len(sets.split(",")) == 3:
score_set1 = sets.split(",")[0]
score_set2 = sets.split(",")[1]
score_set3 = sets.split(",")[2]
row = { " home_team_player_1 ": home_team_player_1 ,
" home_team_player_2": home_team_player_2,
"away_team_player_1": away_team_player_1,
"away_team_player_2":away_team_player_1,
"match": match,
"home_team_country":home_team_country,
"away_team_country": away_team_country,
"home_team_ranking": home_team_ranking,
"away_team_ranking": away_team_ranking,
"match_duration": match_duration,
"home_team_qualification_round": home_team_qualification_round,
"away_team_qualification_round": away_team_qualification_round,
"score_set1":score_set1,
"score_set2":score_set2,
"score_set3":score_set3,
"tournament_category": tournament_category,
"tournament_prize_money": tournament_prize_money,
"tournament_city": tournament_city,
"tournament_year": tournament_year,
"tournament_starting_day": tournament_starting_day,
"tournament_ending_day":tournament_ending_day,
"tournament_month":tournament_month,
"tournament_stars":tournament_stars,
"round_n": round_n
}
counter += 4
rows.append(row)
data = pd.DataFrame(rows)
data.to_csv("beachvb.csv", index = False)
I am not really experienced in web scraping. I have just started as a self-taught and find the HTML source code quite messy and poorly structured.
I want to improve my code in two ways:
Include all the missing matches (country quota matches, semifinals, bronze medal, and gold medal) and the respective category for each match (country quota matches, pool, winner's bracket, semifinals, bronze medal, and gold medal)
iterate the code for more years and tournaments from the dropdown menu at the top of the webpage
I have tried to iterate through different years but my code does not work
tournament_years = {"FIVB 2015", "FIVB 2016"}
dfs = []
for year in tournament_years:
# select desired tournament
box_year = Select(driver.find_element_by_xpath("/html/body/table[3]/tbody/tr/td/table[1]/tbody/tr[1]/td[2]/select"))
box_year.select_by_visible_text(year)
box_matches = Select(driver.find_element_by_xpath("/html/body/table[3]/tbody/tr/td/table[1]/tbody/tr[2]/td[2]/select"))
box_matches.select_by_visible_text("Matches")
The main idea was to create a list of dataframes for each year and each tournament by adding a new loop at the beginning of the code.
If someone has a better idea and technique to do so, it is really appreciated!
I have these 2 functions that are really similar except for the different format of log it will receive and return. One look and return 4 values when the other return 3.
Is there any way I can make 1 general function for these 2? Thank you
> - Borrow book: B#<day>#<Student Name>#<Book name>#<days borrowed for>
> - Return book: R#<day>#<Student Name>#<Book name>
def read_borrow_log(log):
borrow_day = []
borrow_student = []
borrow_book = []
borrow_duration = []
for line in log:
hash_func = line.find("#")
hash_day = line.find("#", hash_func+1)
hash_student = line.find("#", hash_day+1)
hash_book = line.find("#", hash_student+1)
hash_duration = line.find("#", hash_book+1)
borrow_day.append(int(line[(hash_func+1):(hash_day)]))
borrow_student.append(line[(hash_day+1):(hash_student)])
borrow_book.append(line[(hash_student+1):(hash_duration)])
borrow_duration.append(line[(hash_duration+1):])
return borrow_day, borrow_student, borrow_book, borrow_duration
def read_return_log(log):
return_day = []
return_student = []
return_book = []
for line in log:
hash_func = line.find("#")
hash_day = line.find("#", hash_func+1)
hash_student = line.find("#", hash_day+1)
return_day.append(int(line[(hash_func+1):(hash_day)]))
return_student.append(line[(hash_day+1):(hash_student)])
return_book.append(line[(hash_student+1):])
return return_day, return_student, return_book
def main():
borrow_day, borrow_student, borrow_book, borrow_duration = read_borrow_log(borrow_log)
return_day, return_student, return_book = read_return_log(return_log)
Try using python's built-in string split:
def extract_log_parts(log):
recs = []
for line in log:
recs.append(line.split('#'))
# we want the record *columns* -- transpose the table
return tuple(map(list, zip(*recs)))
one thing you might do is to make the 'extra' work done only when a certain optional parameter is passed in as shown:
def read_borrow_log(log,borrow_log=True):
borrow_day = []
borrow_student = []
borrow_book = []
if borrow_log is True:
borrow_duration = []
for line in log:
hash_func = line.find("#")
hash_day = line.find("#", hash_func + 1)
hash_student = line.find("#", hash_day + 1)
if borrow_log is True:
hash_book = line.find("#", hash_student + 1)
hash_duration = line.find("#", hash_book + 1)
borrow_day.append(int(line[(hash_func + 1):(hash_day)]))
borrow_student.append(line[(hash_day + 1):(hash_student)])
borrow_book.append(line[(hash_student + 1):(hash_duration)])
if borrow_log is True:
borrow_duration.append(line[(hash_duration + 1):])
if borrow_log is True:
return borrow_day, borrow_student, borrow_book, borrow_duration
else:
return borrow_day, borrow_student, borrow_book
def main():
borrow_day, borrow_student, borrow_book, borrow_duration = read_borrow_log(borrow_log)
return_day, return_student, return_book = read_borrow_log(return_log,borrow_log=False)
however you might want to rethink the naming convention used since this function will now do more than one thing, which is bad for documentation purposes (and is generally a bad practice to have functions do more than one thing, bad enough that i should downvote my own answer if i can)
I am trying to fill with Python a table in Word with DocxTemplate and I have some issues to do it properly. I want to use 2 dictionnaries to fill the data in 1 table, in the figure below.
Table to fill
The 2 dictionnaries are filled in a loop and I write the template document at the end.
The input document to create my dictionnaries is an DB extraction written in SQL.
My main issue is when I want to fill the table with my data in the 2 different dictionnaries.
In the code below I will give as an example the 2 dictionnaries with values in it.
# -*- coding: utf8 -*-
#
#
from docxtpl import DocxTemplate
if __name__ == "__main__":
document = DocxTemplate("template.docx")
DicoOccuTable = {'`num_carnet_adresses`': '`annuaire_telephonique`\n`carnet_adresses`\n`carnet_adresses_complement',
'`num_eleve`': '`CFA_apprentissage_ctrl_coherence`\n`CFA_apprentissage_ctrl_examen`}
DicoChamp = {'`num_carnet_adresses`': 72, '`num_eleve`': 66}
template_values = {}
#
template_values["keys"] = [[{"name":cle, "occu":val} for cle,val in DicoChamp.items()],
[{"table":vals} for cles,vals in DicoOccuTable.items()]]
#
document.render(template_values)
document.save('output/' + nomTable.replace('`','') + '.docx')
As a result the two lines for the table are created but nothing is written within...
I would like to add that it's only been 1 week that I work on Python, so I feel that I don't manage properly the different objects here.
If you have any suggestion to help me, I would appreciate it !
I put here the loop to create the dictionnaries, it may help you to understand why I coded it wrong :)
for c in ChampList:
with open("db_reference.sql", "r") as f:
listTable = []
line = f.readlines()
for l in line:
if 'CREATE TABLE' in l:
begin = True
linecreateTable = l
x = linecreateTable.split()
nomTable = x[2]
elif c in l and begin == True:
listTable.append(nomTable)
elif ') ENGINE=MyISAM DEFAULT CHARSET=latin1;' in l:
begin = False
nbreOccu=len(listTable)
Tables = "\n".join(listTable)
DicoChamp.update({c:nbreOccu})
DicoOccuTable.update({c:Tables})
# DicoChamp = {c:nbreOccu}
template_values = {}
Thank You very much !
Finally I found a solution for this problem. Here it is.
Instead of using 2 dictionnaries I created 1 dictionnary with this strucuture :
Dico = { Champ : [Occu , Tables] }
The full code for creating the table is detailed below :
from docxtpl import DocxTemplate
document = DocxTemplate("template.docx")
template_values = {}
Context = {}
for c in ChampList:
listTable = []
nbreOccu = 0
OccuTables = []
with open("db_reference.sql", "r") as g:
listTable = []
ligne = g.readlines()
for li in ligne:
if 'CREATE TABLE' in li:
begin = True
linecreateTable2 = li
y = linecreateTable2.split()
nomTable2 = y[2]
elif c in li and begin == True:
listTable.append(nomTable2)
elif ') ENGINE=MyISAM DEFAULT CHARSET=latin1;' in li:
begin = False
elif '/*!40101 SET COLLATION_CONNECTION=#OLD_COLLATION_CONNECTION */;' in li:
nbreOccu=len(listTable)
inter = "\n".join(listTable)
OccuTables.append(nbreOccu)
OccuTables.append(inter)
ChampNumPropre = c.replace('`','')
Context.update({ChampNumPropre:OccuTables})
else:
continue
template_values["keys"] = [{"label":cle, "cols":val} for cle,val in Context.items()]
#
document.render(template_values)
document.save('output/' + nomTable.replace('`','') + '.docx')
And I used a table with the following structure :
I hope you will find your answers here and good luck !
I am trying to return an array of constructed objects that are build on top of objects that I retrieve from some url plus another fields that I get from another url.
I have an array that consists of two arrays that each has about 8000 objects...
I have tried to make each object construction as a thread however it still takes a lot of time...
Any solution? Here is my code:
def get_all_players_full_data(ea_players_json):
all = []
ea_players_json = list(ea_players_json.values())
for i in range(len(ea_players_json)):
for player_obj in ea_players_json[i]:
all.append(player_obj)
for player_obj in range(len(all)):
all_data = []
with concurrent.futures.ThreadPoolExecutor(len(all)) as executor:
for player_data in all:
future = executor.submit(build_full_player_data_obj, player_data)
print(future.result())
all_data.append(future.result())
def build_full_player_data_obj(ea_player_data):
if ea_player_data.get("c") is not None:
player_full_name = ea_player_data.get("c")
else:
player_full_name = ea_player_data.get("f") + " " + ea_player_data.get("l")
player_id = ea_player_data.get("id")
# go to futhead to find all cards of that player
futhead_url_player_data = f'{FUTHEAD_PLAYER}{player_full_name}'
details_of_specific_player = json.loads(requests.get(futhead_url_player_data).content)
cards_from_the_same_id = []
for player_in_json_futhead in details_of_specific_player:
if player_in_json_futhead["player_id"] == player_id:
rating = player_in_json_futhead["rating"]
specific_card_id = player_in_json_futhead["def_id"]
revision = player_in_json_futhead["revision_type"]
name = player_in_json_futhead["full_name"]
nation = player_in_json_futhead["nation_name"]
position = player_in_json_futhead["position"]
club = player_in_json_futhead["club_name"]
cards_from_the_same_id.append(Player(specific_card_id, name, rating, revision, nation,
position, club))
return cards_from_the_same_id
Is it possible to use a for loop to search through the text of tags that correspond to a certain phrase. I've been trying to create this loop but isn't hasn't been working. Any help is appreciated thanks! Here is my code:
def parse_page(self, response):
titles2 = response.xpath('//div[#id = "mainColumn"]/h1/text()').extract_first()
year = response.xpath('//div[#id = "mainColumn"]/h1/span/text()').extract()[0].strip()
aud = response.xpath('//div[#id="scorePanel"]/div[2]')
a_score = aud.xpath('./div[1]/a/div/div[2]/div[1]/span/text()').extract()
a_count = aud.xpath('./div[2]/div[2]/text()').extract()
c_score = response.xpath('//a[#id = "tomato_meter_link"]/span/span[1]/text()').extract()[0].strip()
c_count = response.xpath('//div[#id = "scoreStats"]/div[3]/span[2]/text()').extract()[0].strip()
info = response.xpath('//div[#class="panel-body content_body"]/ul')
mp_rating = info.xpath('./li[1]/div[2]/text()').extract()[0].strip()
genre = info.xpath('./li[2]/div[2]/a/text()').extract_first()
date = info.xpath('./li[5]/div[2]/time/text()').extract_first()
box = response.xpath('//section[#class = "panel panel-rt panel-box "]/div')
actor1 = box.xpath('./div/div[1]/div/a/span/text()').extract()
actor2 = box.xpath('./div/div[2]/div/a/span/text()').extract()
actor3 = box.xpath('./div/div[3]/div/a/span/text()').extract_first()
for x in info.xpath('//li'):
if info.xpath("./li[x]/div[1][contains(text(), 'Box Office: ')/text()]]
box_office = info.xpath('./li[x]/div[2]/text()')
else if info.xpath('./li[x]/div[1]/text()').extract[0] == "Runtime: "):
runtime = info.xpath('./li[x]/div[2]/time/text()')
Your for loop is completely wrong:
1. You're using info. but searching from the root
for x in info.xpath('.//li'):
2. x is a HTML node element and you can use it this way:
if x.xpath("./div[1][contains(., 'Box Office: ')]"):
box_office = x.xpath('./div[2]/text()').extract_first()
I think you might need re() or re_first() to match the certain phrase.
For example:
elif info.xpath('./li[x]/div[1]/text()').re_first('Runtime:') == "Runtime: "):
runtime = info.xpath('./li[x]/div[2]/time/text()')
And you need to modify your for loop, cuz the variable x in it is actually a Selector but not a number, so it's not right to use it like this: li[x].
gangabass in the last answer made a good point on this.