mirror of
https://codeberg.org/forgejo/forgejo.git
synced 2024-12-27 00:32:14 +01:00
9958642502
Fix regression of #5363 (so long ago). The old code definded a document mapping for `issueIndexerDocType`, and assigned it to `BleveIndexerData` as its type. (`BleveIndexerData` has been renamed to `IndexerData` in #25174, but nothing more.) But the old code never used `BleveIndexerData`, it wrote the index with an anonymous struct type. Nonetheless, bleve would use the default auto-mapping for struct it didn't know, so the indexer still worked. This means the custom document mapping was always dead code. The custom document mapping is not useless, it can reduce index storage, this PR brings it back and disable default mapping to prevent it from happening again. Since `IndexerData`(`BleveIndexerData`) has JSON tags, and bleve uses them first, so we should use `repo_id` as the field name instead of `RepoID`. I did a test to compare the storage size before and after this, with about 3k real comments that were migrated from some public repos. Before: ```text [ 160] . ├── [ 42] index_meta.json ├── [ 13] rupture_meta.json └── [ 128] store ├── [6.9M] 00000000005d.zap └── [256K] root.bolt ``` After: ```text [ 160] . ├── [ 42] index_meta.json ├── [ 13] rupture_meta.json └── [ 128] store ├── [3.5M] 000000000065.zap └── [256K] root.bolt ``` It saves about half the storage space. --------- Co-authored-by: Giteabot <teabot@gitea.io>
179 lines
5.7 KiB
Go
179 lines
5.7 KiB
Go
// Copyright 2018 The Gitea Authors. All rights reserved.
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
package bleve
|
|
|
|
import (
|
|
"context"
|
|
|
|
indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
|
|
inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve"
|
|
"code.gitea.io/gitea/modules/indexer/issues/internal"
|
|
|
|
"github.com/blevesearch/bleve/v2"
|
|
"github.com/blevesearch/bleve/v2/analysis/analyzer/custom"
|
|
"github.com/blevesearch/bleve/v2/analysis/token/camelcase"
|
|
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
|
|
"github.com/blevesearch/bleve/v2/analysis/token/unicodenorm"
|
|
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
|
|
"github.com/blevesearch/bleve/v2/mapping"
|
|
"github.com/blevesearch/bleve/v2/search/query"
|
|
)
|
|
|
|
const (
|
|
issueIndexerAnalyzer = "issueIndexer"
|
|
issueIndexerDocType = "issueIndexerDocType"
|
|
issueIndexerLatestVersion = 3
|
|
)
|
|
|
|
// numericEqualityQuery a numeric equality query for the given value and field
|
|
func numericEqualityQuery(value int64, field string) *query.NumericRangeQuery {
|
|
f := float64(value)
|
|
tru := true
|
|
q := bleve.NewNumericRangeInclusiveQuery(&f, &f, &tru, &tru)
|
|
q.SetField(field)
|
|
return q
|
|
}
|
|
|
|
func newMatchPhraseQuery(matchPhrase, field, analyzer string) *query.MatchPhraseQuery {
|
|
q := bleve.NewMatchPhraseQuery(matchPhrase)
|
|
q.FieldVal = field
|
|
q.Analyzer = analyzer
|
|
return q
|
|
}
|
|
|
|
const unicodeNormalizeName = "unicodeNormalize"
|
|
|
|
func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
|
|
return m.AddCustomTokenFilter(unicodeNormalizeName, map[string]interface{}{
|
|
"type": unicodenorm.Name,
|
|
"form": unicodenorm.NFC,
|
|
})
|
|
}
|
|
|
|
const maxBatchSize = 16
|
|
|
|
// IndexerData an update to the issue indexer
|
|
type IndexerData internal.IndexerData
|
|
|
|
// Type returns the document type, for bleve's mapping.Classifier interface.
|
|
func (i *IndexerData) Type() string {
|
|
return issueIndexerDocType
|
|
}
|
|
|
|
// generateIssueIndexMapping generates the bleve index mapping for issues
|
|
func generateIssueIndexMapping() (mapping.IndexMapping, error) {
|
|
mapping := bleve.NewIndexMapping()
|
|
docMapping := bleve.NewDocumentMapping()
|
|
|
|
numericFieldMapping := bleve.NewNumericFieldMapping()
|
|
numericFieldMapping.Store = false
|
|
numericFieldMapping.IncludeInAll = false
|
|
docMapping.AddFieldMappingsAt("repo_id", numericFieldMapping)
|
|
|
|
textFieldMapping := bleve.NewTextFieldMapping()
|
|
textFieldMapping.Store = false
|
|
textFieldMapping.IncludeInAll = false
|
|
docMapping.AddFieldMappingsAt("title", textFieldMapping)
|
|
docMapping.AddFieldMappingsAt("content", textFieldMapping)
|
|
docMapping.AddFieldMappingsAt("comments", textFieldMapping)
|
|
|
|
if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
|
|
return nil, err
|
|
} else if err = mapping.AddCustomAnalyzer(issueIndexerAnalyzer, map[string]interface{}{
|
|
"type": custom.Name,
|
|
"char_filters": []string{},
|
|
"tokenizer": unicode.Name,
|
|
"token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name},
|
|
}); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
mapping.DefaultAnalyzer = issueIndexerAnalyzer
|
|
mapping.AddDocumentMapping(issueIndexerDocType, docMapping)
|
|
mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())
|
|
mapping.DefaultMapping = bleve.NewDocumentDisabledMapping() // disable default mapping, avoid indexing unexpected structs
|
|
|
|
return mapping, nil
|
|
}
|
|
|
|
var _ internal.Indexer = &Indexer{}
|
|
|
|
// Indexer implements Indexer interface
|
|
type Indexer struct {
|
|
inner *inner_bleve.Indexer
|
|
indexer_internal.Indexer // do not composite inner_bleve.Indexer directly to avoid exposing too much
|
|
}
|
|
|
|
// NewIndexer creates a new bleve local indexer
|
|
func NewIndexer(indexDir string) *Indexer {
|
|
inner := inner_bleve.NewIndexer(indexDir, issueIndexerLatestVersion, generateIssueIndexMapping)
|
|
return &Indexer{
|
|
Indexer: inner,
|
|
inner: inner,
|
|
}
|
|
}
|
|
|
|
// Index will save the index data
|
|
func (b *Indexer) Index(_ context.Context, issues []*internal.IndexerData) error {
|
|
batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize)
|
|
for _, issue := range issues {
|
|
if err := batch.Index(indexer_internal.Base36(issue.ID), (*IndexerData)(issue)); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return batch.Flush()
|
|
}
|
|
|
|
// Delete deletes indexes by ids
|
|
func (b *Indexer) Delete(_ context.Context, ids ...int64) error {
|
|
batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize)
|
|
for _, id := range ids {
|
|
if err := batch.Delete(indexer_internal.Base36(id)); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return batch.Flush()
|
|
}
|
|
|
|
// Search searches for issues by given conditions.
|
|
// Returns the matching issue IDs
|
|
func (b *Indexer) Search(ctx context.Context, keyword string, repoIDs []int64, limit, start int) (*internal.SearchResult, error) {
|
|
var repoQueriesP []*query.NumericRangeQuery
|
|
for _, repoID := range repoIDs {
|
|
repoQueriesP = append(repoQueriesP, numericEqualityQuery(repoID, "repo_id"))
|
|
}
|
|
repoQueries := make([]query.Query, len(repoQueriesP))
|
|
for i, v := range repoQueriesP {
|
|
repoQueries[i] = query.Query(v)
|
|
}
|
|
|
|
indexerQuery := bleve.NewConjunctionQuery(
|
|
bleve.NewDisjunctionQuery(repoQueries...),
|
|
bleve.NewDisjunctionQuery(
|
|
newMatchPhraseQuery(keyword, "title", issueIndexerAnalyzer),
|
|
newMatchPhraseQuery(keyword, "content", issueIndexerAnalyzer),
|
|
newMatchPhraseQuery(keyword, "comments", issueIndexerAnalyzer),
|
|
))
|
|
search := bleve.NewSearchRequestOptions(indexerQuery, limit, start, false)
|
|
search.SortBy([]string{"-_score"})
|
|
|
|
result, err := b.inner.Indexer.SearchInContext(ctx, search)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
ret := internal.SearchResult{
|
|
Hits: make([]internal.Match, 0, len(result.Hits)),
|
|
}
|
|
for _, hit := range result.Hits {
|
|
id, err := indexer_internal.ParseBase36(hit.ID)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
ret.Hits = append(ret.Hits, internal.Match{
|
|
ID: id,
|
|
})
|
|
}
|
|
return &ret, nil
|
|
}
|