mirror of
https://codeberg.org/forgejo/forgejo.git
synced 2025-01-07 14:12:21 +01:00
ad2642a8aa
* Implementation for calculating language statistics Implement saving code language statistics to database Implement rendering language stats Add primary language to show in repository list Implement repository stats indexer queue Add indexer test Refactor to use queue module * Do not timeout for queues
107 lines
2.7 KiB
Go
107 lines
2.7 KiB
Go
package enry
|
|
|
|
import (
|
|
"math"
|
|
"sort"
|
|
|
|
"github.com/src-d/enry/v2/internal/tokenizer"
|
|
)
|
|
|
|
// Classifier detects the possible languages of a piece of content, choosing
// among a set of candidates.
type Classifier interface {
	// Classify returns the names of the possible languages for content.
	// candidates maps language names to weights, which callers can use to
	// assign weights to languages dynamically.
	Classify(content []byte, candidates map[string]float64) (languages []string)
}
|
|
|
|
// classifier holds the per-language and per-token values that the scoring
// methods combine into a ranking.
type classifier struct {
	// languagesLogProbabilities is keyed by language name; the value is the
	// starting score of that language before any token contribution is added.
	languagesLogProbabilities map[string]float64

	// tokensLogProbabilities is keyed by language name, then by token; the
	// value is what a known token adds to that language's score.
	tokensLogProbabilities map[string]map[string]float64

	// tokensTotal is the divisor used to derive the fallback value for a
	// token that has no entry for the language being scored.
	tokensTotal float64
}
|
|
|
|
// scoredLanguage pairs a language name with the score computed for it, so
// that languages can be ordered by score.
type scoredLanguage struct {
	// language is the language name.
	language string

	// score is the value the language is ranked by.
	score float64
}
|
|
|
|
// Classify returns a sorted slice of possible languages sorted by decreasing language's probability
|
|
func (c *classifier) Classify(content []byte, candidates map[string]float64) []string {
|
|
|
|
var languages map[string]float64
|
|
if len(candidates) == 0 {
|
|
languages = c.knownLangs()
|
|
} else {
|
|
languages = make(map[string]float64, len(candidates))
|
|
for candidate, weight := range candidates {
|
|
if lang, ok := GetLanguageByAlias(candidate); ok {
|
|
candidate = lang
|
|
}
|
|
|
|
languages[candidate] = weight
|
|
}
|
|
}
|
|
|
|
empty := len(content) == 0
|
|
scoredLangs := make([]*scoredLanguage, 0, len(languages))
|
|
|
|
var tokens []string
|
|
if !empty {
|
|
tokens = tokenizer.Tokenize(content)
|
|
}
|
|
|
|
for language := range languages {
|
|
score := c.languagesLogProbabilities[language]
|
|
if !empty {
|
|
score += c.tokensLogProbability(tokens, language)
|
|
}
|
|
scoredLangs = append(scoredLangs, &scoredLanguage{
|
|
language: language,
|
|
score: score,
|
|
})
|
|
}
|
|
|
|
return sortLanguagesByScore(scoredLangs)
|
|
}
|
|
|
|
func sortLanguagesByScore(scoredLangs []*scoredLanguage) []string {
|
|
sort.Stable(byScore(scoredLangs))
|
|
sortedLanguages := make([]string, 0, len(scoredLangs))
|
|
for _, scoredLang := range scoredLangs {
|
|
sortedLanguages = append(sortedLanguages, scoredLang.language)
|
|
}
|
|
|
|
return sortedLanguages
|
|
}
|
|
|
|
func (c *classifier) knownLangs() map[string]float64 {
|
|
langs := make(map[string]float64, len(c.languagesLogProbabilities))
|
|
for lang := range c.languagesLogProbabilities {
|
|
langs[lang]++
|
|
}
|
|
|
|
return langs
|
|
}
|
|
|
|
func (c *classifier) tokensLogProbability(tokens []string, language string) float64 {
|
|
var sum float64
|
|
for _, token := range tokens {
|
|
sum += c.tokenProbability(token, language)
|
|
}
|
|
|
|
return sum
|
|
}
|
|
|
|
func (c *classifier) tokenProbability(token, language string) float64 {
|
|
tokenProb, ok := c.tokensLogProbabilities[language][token]
|
|
if !ok {
|
|
tokenProb = math.Log(1.000000 / c.tokensTotal)
|
|
}
|
|
|
|
return tokenProb
|
|
}
|
|
|
|
type byScore []*scoredLanguage
|
|
|
|
func (b byScore) Len() int { return len(b) }
|
|
func (b byScore) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
|
|
func (b byScore) Less(i, j int) bool { return b[j].score < b[i].score }
|