mirror of
https://codeberg.org/forgejo/forgejo.git
synced 2025-02-06 20:49:48 +01:00
70 lines
1.8 KiB
Go
70 lines
1.8 KiB
Go
|
// Copyright 2024 The Forgejo Authors. All rights reserved.
|
||
|
// SPDX-License-Identifier: MIT
|
||
|
|
||
|
package hierarchy
|
||
|
|
||
|
import (
|
||
|
"bytes"
|
||
|
|
||
|
"github.com/blevesearch/bleve/v2/analysis"
|
||
|
"github.com/blevesearch/bleve/v2/registry"
|
||
|
)
|
||
|
|
||
|
// Name is the identifier under which this tokenizer is registered in
// bleve's registry (see init).
const Name = "path_hierarchy"
|
||
|
|
||
|
// PathHierarchyTokenizer is a stateless bleve tokenizer that splits a
// slash-separated path into one token per hierarchy level (see Tokenize).
type PathHierarchyTokenizer struct{}
|
||
|
|
||
|
// Similar to elastic's path_hierarchy tokenizer
|
||
|
// This tokenizes a given path into all the possible hierarchies
|
||
|
// For example,
|
||
|
// modules/indexer/code/search.go =>
|
||
|
//
|
||
|
// modules/
|
||
|
// modules/indexer
|
||
|
// modules/indexer/code
|
||
|
// modules/indexer/code/search.go
|
||
|
func (t *PathHierarchyTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
||
|
// trim any extra slashes
|
||
|
input = bytes.Trim(input, "/")
|
||
|
|
||
|
// zero allocations until the nested directories exceed a depth of 8 (which is unlikely)
|
||
|
rv := make(analysis.TokenStream, 0, 8)
|
||
|
count, off := 1, 0
|
||
|
|
||
|
// iterate till all directory seperators
|
||
|
for i := bytes.IndexRune(input[off:], '/'); i != -1; i = bytes.IndexRune(input[off:], '/') {
|
||
|
// the index is relative to input[offest...]
|
||
|
// add this index to the accumlated offset to get the index of the current seperator in input[0...]
|
||
|
off += i
|
||
|
rv = append(rv, &analysis.Token{
|
||
|
Term: input[:off], // take the slice, input[0...index of seperator]
|
||
|
Start: 0,
|
||
|
End: off,
|
||
|
Position: count,
|
||
|
Type: analysis.AlphaNumeric,
|
||
|
})
|
||
|
// increment the offset after considering the seperator
|
||
|
off++
|
||
|
count++
|
||
|
}
|
||
|
|
||
|
// the entire file path should always be the last token
|
||
|
rv = append(rv, &analysis.Token{
|
||
|
Term: input,
|
||
|
Start: 0,
|
||
|
End: len(input),
|
||
|
Position: count,
|
||
|
Type: analysis.AlphaNumeric,
|
||
|
})
|
||
|
|
||
|
return rv
|
||
|
}
|
||
|
|
||
|
func TokenizerConstructor(config map[string]any, cache *registry.Cache) (analysis.Tokenizer, error) {
|
||
|
return &PathHierarchyTokenizer{}, nil
|
||
|
}
|
||
|
|
||
|
// init registers the tokenizer under Name ("path_hierarchy") so bleve
// analyzer definitions can reference it by name.
func init() {
	registry.RegisterTokenizer(Name, TokenizerConstructor)
}
|