给定字符串向量的向量,如:
sentences = [ ["Julia", "is", "1000x", "faster", "than", "Python!"],
["Julia", "reads", "beautiful!"],
["Python", "has", "600", "times", "more", "libraries"]
]
我试图在每个句子中筛选出一些标记,而不丢失外部向量结构(即,不将向量展平为单个标记列表)。
到目前为止,我已经使用经典的for循环实现了这一点:
# Per-sentence token statistics, accumulated in three parallel vectors.
# The accumulators are typed (`Int[]` rather than `[]`) so the zipped result
# has a concrete element type instead of Tuple{Any, Any, Any}.
number_of_alphabetical_tokens = Int[]
# Originally declared as `number_of_long_tokens` while every use below referred
# to `number_of_long_words`, which raised an UndefVarError.
number_of_long_words = Int[]
total_tokens = Int[]
for sent in sentences
    # `count` tallies the matching tokens without building a filtered array,
    # and `push!` is the idiomatic way to add a single element.
    push!(number_of_alphabetical_tokens, count(token -> all(isletter, token), sent))
    push!(number_of_long_words, count(token -> length(token) > 2, sent))
    push!(total_tokens, length(sent))
end
collect(zip(number_of_alphabetical_tokens, number_of_long_words, total_tokens))
# Output (edited per @shayan's observation; with the typed accumulators above
# the element type prints as Tuple{Int64, Int64, Int64} rather than Any):
3-element Vector{Tuple{Any, Any, Any}}:
(4, 5, 6)
(2, 3, 3)
(5, 6, 6)
这样可以完成任务,但耗时比我预期的要长(我有6000多个文档,每个文档都有数千个句子……),而且看起来有点像反模式。
有没有办法通过推导式(comprehension)或广播(或任何性能更高的方法)来做到这一点?
发布于 2022-10-17 18:54:30
在Julia中,没有理由因为性能原因而避免循环。循环是快速的,矢量化代码只是伪装的循环。
下面是一个使用循环和一些归约函数(如all和count)来完成此操作的示例:
"""
    wordstats(sentences)

Return a `Vector{NTuple{3, Int}}` holding one `(alphabetical, long, total)`
tuple per element of `sentences`:

- `alphabetical`: number of tokens whose characters all satisfy `isletter`
- `long`: number of tokens with `length(token) > 2`
- `total`: `length` of the sentence
"""
function wordstats(sentences)
    # Predicates are built once and reused for every sentence.
    onlyletters = Base.Fix1(all, isletter)   # token -> all(isletter, token)
    longerthan2 = >(2) ∘ length              # token -> length(token) > 2

    stats = Vector{NTuple{3, Int}}(undef, length(sentences))
    for (idx, tokens) in pairs(sentences)
        stats[idx] = (count(onlyletters, tokens), count(longerthan2, tokens), length(tokens))
    end
    return stats
end
# Note from the answer: the code above is not optimized (for example, counting
# the words longer than 2 could be improved), but it reportedly runs in about
# 700 ns on the author's laptop -- the unit is garbled as "is" in this copy, ns
# presumed -- which is much faster than the vectorized solution.
编辑:这里的代码基本上是相同的,但是使用map do语法(所以您不必知道返回类型):
"""
    wordstats2(sentences)

Map each sentence to a tuple `(alphabetical, long, total)`: the number of
tokens consisting only of letters, the number of tokens longer than two
characters, and the number of tokens in the sentence. The container and
element type of the result are whatever `map` produces for `sentences`.
"""
function wordstats2(sentences)
    isalphabetic(token) = all(isletter, token)
    islong(token) = length(token) > 2
    return map(sentences) do tokens
        (count(isalphabetic, tokens), count(islong, tokens), length(tokens))
    end
end
# Posted 2022-10-17 18:13:29
首先,我想您在编写最终结果时可能有错误;例如,您编写了7,表示sentences的第一个元素中的总令牌数,而实际上应该是6。
您可以遵循这样的过程,完全矢量化:
julia> sentences = [ ["Julia", "is", "1000x", "faster", "than", "Python!"],
["Julia", "reads", "beautiful!"],
["Python", "has", "600", "times", "more", "libraries"]
];
"""
    check_all_letter(str::AbstractString)

Return `true` when every character of `str` satisfies `isletter`.

The argument is typed `AbstractString` rather than `String`, so substrings
(for example the pieces produced by `split`) are accepted as well. An empty
string yields `true`, because `all` over no elements is `true`.
"""
function check_all_letter(str::AbstractString)
    return all(isletter, str)
end
check_all_letter (generic function with 1 method)
julia> all_letters = map(x->filter(y->check_all_letter.(y), x), sentences)
3-element Vector{Vector{String}}:
["Julia", "is", "faster", "than"]
["Julia", "reads"]
["Python", "has", "times", "more", "libraries"]
julia> length.(all_letters)
3-element Vector{Int64}:
4
2
5
我可以对number_of_long_words和total_tokens执行类似的过程。将所有内容封装在一个函数中,得到:
"""
    arbitrary_name(vec::AbstractVector{<:AbstractVector{<:AbstractString}})

Return a `Vector{NTuple{3, Int}}` with one `(alphabetical, long, total)` tuple
per sentence in `vec`:

- `alphabetical`: tokens whose characters all satisfy `isletter`
- `long`: tokens with more than two characters
- `total`: number of tokens in the sentence

The statistics are computed from the argument `vec`. The earlier version read
the global `sentences` instead, so the argument was silently ignored.
"""
function arbitrary_name(vec::AbstractVector{<:AbstractVector{<:AbstractString}})
    # `count` avoids allocating a filtered copy of every sentence only to take
    # its length; the typed comprehension keeps the element type concrete even
    # when `vec` is empty.
    return NTuple{3, Int}[
        (
            count(token -> all(isletter, token), sent),
            count(token -> length(token) > 2, sent),
            length(sent),
        ) for sent in vec
    ]
end
arbitrary_name (generic function with 1 method)
julia> arbitrary_name(sentences)
3-element Vector{Tuple{Int64, Int64, Int64}}:
(4, 5, 6)
(2, 3, 3)
(5, 6, 6)附加解释
事实上,当我编写类似于length.(y).>2的东西时,我试图通过向量化来链接一些julia函数。请考虑这个示例,以充分了解通过length.(y).>2正在发生的事情。
julia> vec = ["foo", "bar", "baz"];
julia> lengths = length.(vec)
3-element Vector{Int64}:
3
3
3
julia> more_than_two = lengths .> 2
3-element BitVector:
1
1
1
# This is exactly equal to this:
julia> length.(vec).>2
3-element BitVector:
1
1
1
# Or
julia> vec .|> length .|> x -> x > 2
3-element BitVector:
1
1
1https://stackoverflow.com/questions/74100762
复制相似问题