// Copyright 2024 The Forgejo Authors. All rights reserved. // SPDX-License-Identifier: MIT package hierarchy import ( "bytes" "github.com/blevesearch/bleve/v2/analysis" "github.com/blevesearch/bleve/v2/registry" ) const Name = "path_hierarchy" type PathHierarchyTokenizer struct{} // Similar to elastic's path_hierarchy tokenizer // This tokenizes a given path into all the possible hierarchies // For example, // modules/indexer/code/search.go => // // modules/ // modules/indexer // modules/indexer/code // modules/indexer/code/search.go func (t *PathHierarchyTokenizer) Tokenize(input []byte) analysis.TokenStream { // trim any extra slashes input = bytes.Trim(input, "/") // zero allocations until the nested directories exceed a depth of 8 (which is unlikely) rv := make(analysis.TokenStream, 0, 8) count, off := 1, 0 // iterate till all directory seperators for i := bytes.IndexRune(input[off:], '/'); i != -1; i = bytes.IndexRune(input[off:], '/') { // the index is relative to input[offest...] // add this index to the accumlated offset to get the index of the current seperator in input[0...] off += i rv = append(rv, &analysis.Token{ Term: input[:off], // take the slice, input[0...index of seperator] Start: 0, End: off, Position: count, Type: analysis.AlphaNumeric, }) // increment the offset after considering the seperator off++ count++ } // the entire file path should always be the last token rv = append(rv, &analysis.Token{ Term: input, Start: 0, End: len(input), Position: count, Type: analysis.AlphaNumeric, }) return rv } func TokenizerConstructor(config map[string]any, cache *registry.Cache) (analysis.Tokenizer, error) { return &PathHierarchyTokenizer{}, nil } func init() { registry.RegisterTokenizer(Name, TokenizerConstructor) }