首先,我是一个python新手,我对其中一些东西的工作原理只有一半的了解。我一直在尝试为一个标签项目构建单词矩阵,我希望我可以自己解决这个问题,但我没有看到很多关于我的特定错误的文档。因此,如果这是非常明显的事情,我要提前道歉。
我试图让一组函数在几个不同的变体中工作,但我总是得到 "AttributeError: 'list' object has no attribute 'definition'"。
import pandas as pd
from pandas import DataFrame, Series
import nltk.data
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.tokenize import TreebankWordTokenizer
# Gets synsets for a given term.
def get_synset(word):
    """Return the name of the first WordNet synset found for ``word``.

    Yields ``None`` when ``wn.synsets`` produces no synsets for the word.
    """
    first_match = next(iter(wn.synsets(word)), None)
    if first_match is None:
        return None
    return first_match.name()
#Gets definitions for a synset.
# NOTE(review): this is the version that produces the AttributeError reported
# in the question. It calls ``wn.synsets`` (plural) and then looks up an
# attribute spelled ``defnition``; the corrected version given in the answer
# uses ``wn.synset(syn).definition()``. The code is left exactly as posted so
# the question still matches the error it describes.
# NOTE(review): the body's indentation is missing in this copy of the post.
def get_def(syn):
return wn.synsets(syn).defnition()
# Creates a dataframe called sector_matrix based on another dataframe's column. Should be followed with an export.
def sector_tagger(frame):
    """Build a word / synset / definition table from a column of category text.

    ``frame`` is iterated entry by entry; every entry is tokenised with the
    module-level ``tok``, lower-cased, and filtered against ``english_stops``.
    Each remaining word is looked up with ``get_synset`` and each resulting
    synset name is passed to ``get_def``.

    Returns a DataFrame with the columns 'Categories', 'Synsets' and
    'Definition'.
    """
    # Never read below; kept because the posted code makes this call too, so
    # ``frame`` must still provide ``tolist()``.
    _sentences = frame.tolist()

    lowered = []
    for entry in frame:
        for token in tok.tokenize(entry):
            lowered.append(token.lower())

    kept_words = [word for word in lowered if word not in english_stops]
    synset_names = list(map(get_synset, kept_words))

    sector_matrix = DataFrame({'Categories': kept_words,
                               'Synsets': synset_names})

    definitions = []
    for synset_name in sector_matrix['Synsets']:
        definitions.append(get_def(synset_name))
    sector_matrix['Definition'] = definitions
    return sector_matrix
在我从excel中读取的数据帧上调用函数:
test = pd.read_excel('data.xlsx')
sector_tagger函数的调用方式如下:
agri_matrix = sector_tagger(agri['Category'])
在填充 DataFrame 的列表推导式中,以前的一个版本调用的是 wn.synsets(w).definition()。另一个版本则试图在 Jupyter Notebook 中事后调用 definition。我几乎总是得到属性错误。也就是说,当我查看 sector_matrix['Synsets'] 的数据类型时,得到的是 "object" 类型,而当我打印该列时,各项周围并没有 []。
我试过了:
把 "w" 包在 () 中
在 notebook 中调用它
构建一个列表推导式
奇怪的是,我昨天在摆弄这段代码时,可以直接在笔记本里完成一些事情,但是 (a) 它很混乱,(b) 不具备可扩展性,(c) 它无法用于我想应用的其他类别。
# Notebook version of the pipeline for the agri-food categories.
# NOTE(review): `df` is not defined in the visible snippets (the Excel file is
# read into `test` earlier) -- confirm which frame is meant.

# Rows where both the 'Agri-Food' and the 'Total' column equal 1.
agrimask = (df['Agri-Food']==1) & (df['Total']==1)
# Keep only the 'Category' column for those rows.
df_agri = df.loc[agrimask,['Category']]
# Tokenise every category phrase (one token list per phrase).
agri_words = [tok.tokenize(a) for a in df_agri['Category']]
# Flatten the token lists into one list of lower-cased tokens.
agri_cip_words = [a.lower() for sub in agri_words for a in sub]
# Drop English stop words.
agri_clean = [w for w in agri_cip_words if w not in english_stops]
# One token per row.
df_agri_clean = DataFrame({'Category': agri_clean})
# Mask out cells equal to ',', map the token 'horticulture/horticultural' to
# 'horticulture', then drop the masked rows and any duplicate rows.
df_agri_clean = df_agri_clean[df_agri_clean != ','].replace('horticulture/horticultural','horticulture').dropna().drop_duplicates()
# NOTE(review): `syn` is not defined anywhere in the visible code; presumably
# it returns the list of synsets for a word (e.g. wn.synsets) -- confirm.
# If it does, `x[0]` raises IndexError for any word with an empty result.
df_agri_clean['Synsets'] = [x[0].name() for x in df_agri_clean['Category'].apply(syn)]
# Look up the definition for each synset name.
df_agri_clean['Definition'] = [wn.synset(x).definition() for x in df_agri_clean['Synsets']]
# Name of the first lemma of each synset.
df_agri_clean['Lemma'] = [wn.synset(x).lemmas()[0].name() for x in df_agri_clean['Synsets']]
# Bare expression: shows the frame when it is the last line of a notebook cell.
df_agri_clean
Edit1:这里有一个指向sample of the data的链接。
Edit2:另外,我使用的静态变量在这里(都基于标准的NLTK库):
tok = TreebankWordTokenizer()
english_stops = set(stopwords.words('english'))
french_stops = set(stopwords.words('french'))
Edit3:你可以在这里看到这个代码的工作版本:Working Code
发布于 2018-09-19 04:18:57
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.tokenize import TreebankWordTokenizer as tok
english_stops = set(stopwords.words('english'))
# Gets synsets for a given term.
def get_synset(word):
    """Return the name of the first WordNet synset for ``word``.

    Args:
        word: The term to look up with ``wn.synsets``.

    Returns:
        The ``name()`` of the first synset, or ``None`` when the lookup
        yields no synsets.
    """
    # Use a distinct loop name so the ``word`` parameter is not overwritten.
    for synset in wn.synsets(word):
        return synset.name()
    return None
#Gets definitions for a synset.
def get_def(syn):
    """Return the WordNet definition text for the synset named ``syn``."""
    # Note for the asker: ``definition`` was misspelled in the question's code.
    synset = wn.synset(syn)
    return synset.definition()
# Creates a dataframe called sector_matrix based on another dataframe's column. Should be followed with an export.
def sector_tagger(frame):
    """Tokenise one category phrase and look up a synset and definition per word.

    Despite its name, ``frame`` is a single text string here: the function is
    meant to be applied element-wise, e.g. ``df['Category'].apply(sector_tagger)``.

    Args:
        frame: The phrase to tokenise.

    Returns:
        A DataFrame with the columns 'Categories' (lower-cased words that are
        not English stop words), 'Synsets' (the result of ``get_synset`` for
        each word) and 'Definition' (the result of ``get_def`` for each
        synset name, or '' where no synset was found).
    """
    lowered = [w.lower() for w in tok().tokenize(frame)]
    clean_words = [w for w in lowered if w not in english_stops]
    synset = [get_synset(w) for w in clean_words]
    # Build the definitions from the plain Python list rather than reading the
    # names back out of a DataFrame column: depending on the inferred dtype,
    # pandas may hand back NaN instead of None for the missing entries, and
    # NaN would slip past the None check.
    definitions = [get_def(s) if s is not None else '' for s in synset]
    return pd.DataFrame({'Categories': clean_words,
                         'Synsets': synset,
                         'Definition': definitions})
agri_matrix = df['Category'].apply(sector_tagger)
如果这回答了您的问题,请将其选为answer
get_def 的输出是一个短语列表
替代方法
# Single-pass character mapping: '/' becomes a space; '(', ')' and ',' are removed.
_SECTOR_PUNCT_TABLE = str.maketrans({'/': ' ', '(': '', ')': '', ',': ''})


def sector_tagger(frame):
    """Clean one category phrase and return its words, synsets and definitions.

    Args:
        frame: A single text phrase (a string, not a DataFrame).

    Returns:
        A 3-tuple ``(clean_words, synset, def_matrix)`` of equal-length lists:
        the lower-cased words that are not English stop words, the result of
        ``get_synset`` for each word, and the result of ``get_def`` for each
        synset name ('' where no synset was found).
    """
    frame = frame.translate(_SECTOR_PUNCT_TABLE)
    # ``tok`` is the tokenizer class, so it must be instantiated: note the ().
    tok_list = tok().tokenize(frame)
    split_words = [w.lower() for w in tok_list]
    clean_words = [w for w in split_words if w not in english_stops]
    synset = [get_synset(w) for w in clean_words]
    def_matrix = [get_def(s) if s is not None else '' for s in synset]
    return clean_words, synset, def_matrix
poo = df['Category'].apply(sector_tagger)
poo[0] =
(['agricultural', 'domestic', 'animal', 'services'],
['agricultural.a.01', 'domestic.n.01', 'animal.n.01', 'services.n.01'],
['relating to or used in or promoting agriculture or farming',
'a servant who is paid to perform menial tasks around the household',
'a living organism characterized by voluntary movement',
'performance of duties or provision of space and equipment helpful to others'])
# Split each (words, synsets, definitions) tuple in ``poo`` into three
# parallel lists, one entry per original category phrase.
list_clean_words = [row[0] for row in poo]
list_synset = [row[1] for row in poo]
list_def_matrix = [row[2] for row in poo]

# One row per category phrase; every cell holds a list.
agri_matrix = pd.DataFrame()
agri_matrix['Categories'] = list_clean_words
agri_matrix['Synsets'] = list_synset
agri_matrix['Definition'] = list_def_matrix
agri_matrix
Categories Synsets Definition
0 [agricultural, domestic, animal, services] [agricultural.a.01, domestic.n.01, animal.n.01... [relating to or used in or promoting agricultu...
1 [agricultural, food, products, processing] [agricultural.a.01, food.n.01, merchandise.n.0... [relating to or used in or promoting agricultu...
2 [agricultural, business, management] [agricultural.a.01, business.n.01, management.... [relating to or used in or promoting agricultu...
3 [agricultural, mechanization] [agricultural.a.01, mechanization.n.01] [relating to or used in or promoting agricultu...
4 [agricultural, production, operations] [agricultural.a.01, production.n.01, operation... [relating to or used in or promoting agricultu...
将每个列表拆分为一个很长的列表(它们是有序的)
def create_long_list_from_list_of_lists(list_of_lists):
    """Flatten one level of nesting into a single list, preserving order.

    Args:
        list_of_lists: An iterable whose items are themselves iterable.

    Returns:
        A new list containing every inner item, in the order encountered.
        An empty input gives an empty list.
    """
    return [word for one_list in list_of_lists for word in one_list]
long_list_clean_words = create_long_list_from_list_of_lists(list_clean_words)
long_list_synset = create_long_list_from_list_of_lists(list_synset)
long_list_def_matrix = create_long_list_from_list_of_lists(list_def_matrix)
将其转变为Uniques Categories的DataFrame
# One row per distinct (word, synset, definition) triple, re-indexed from 0.
_agri_columns = {
    'Categories': long_list_clean_words,
    'Synsets': long_list_synset,
    'Definitions': long_list_def_matrix,
}
agri_df = pd.DataFrame(_agri_columns).drop_duplicates().reset_index(drop=True)
agri_df.head(4)
Categories Synsets Definitions
0 ceramic ceramic.n.01 an artifact made of hard brittle material prod...
1 horticultural horticultural.a.01 of or relating to the cultivation of plants
2 construction construction.n.01 the act of constructing something
3 building building.n.01 a structure that has a roof and walls and stan...
结束语
from nltk.tokenize import TreebankWordTokenizer as tok
或者:
from nltk.tokenize import word_tokenize
要使用以下命令:
tok().tokenize(string_text_phrase) # text is a string phrase, not a list of words
或者:
word_tokenize(string_text_phrase)
这两种方法似乎都会产生相同的输出,即单词列表。
input = "Agricultural and domestic animal services"
output_of_both_methods = ['Agricultural', 'and', 'domestic', 'animal', 'services']
https://stackoverflow.com/questions/52392130
复制相似问题