Files
Gists/internal/index/bleve.go
Thomas Miceli e91139d3ec Improve code search + tests (#663)
Signed-off-by: Thomas Miceli <tho.miceli@gmail.com>
Co-authored-by: Qiang Zhou <zhouqiang.loaded@bytedance.com>
Co-authored-by: theodoruszq <theodoruszq@gmail.com>
2026-03-13 11:16:10 +08:00

384 lines
12 KiB
Go

package index
import (
"errors"
"fmt"
"os"
"strconv"
"strings"
"github.com/blevesearch/bleve/v2"
"github.com/blevesearch/bleve/v2/analysis/analyzer/custom"
"github.com/blevesearch/bleve/v2/analysis/token/camelcase"
"github.com/blevesearch/bleve/v2/analysis/token/length"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/token/unicodenorm"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
"github.com/blevesearch/bleve/v2/search/query"
"github.com/rs/zerolog/log"
"github.com/thomiceli/opengist/internal/config"
)
type BleveIndexer struct {
index bleve.Index
path string
}
func NewBleveIndexer(path string) *BleveIndexer {
return &BleveIndexer{path: path}
}
func (i *BleveIndexer) Init() error {
errChan := make(chan error, 1)
go func() {
bleveIndex, err := i.open()
if err != nil {
log.Error().Err(err).Msg("Failed to open Bleve index")
i.Close()
errChan <- err
return
}
i.index = bleveIndex
log.Info().Msg("Bleve indexer initialized")
errChan <- nil
}()
return <-errChan
}
func (i *BleveIndexer) open() (bleve.Index, error) {
bleveIndex, err := bleve.Open(i.path)
if err == nil {
return bleveIndex, nil
}
if !errors.Is(err, bleve.ErrorIndexPathDoesNotExist) {
return nil, err
}
mapping := bleve.NewIndexMapping()
// Token filters
if err = mapping.AddCustomTokenFilter("unicodeNormalize", map[string]any{
"type": unicodenorm.Name,
"form": unicodenorm.NFC,
}); err != nil {
return nil, err
}
if err = mapping.AddCustomTokenFilter("lengthMin2", map[string]interface{}{
"type": length.Name,
"min": 2.0,
}); err != nil {
return nil, err
}
// Analyzer: split mode (camelCase splitting for partial search)
// "CPUCard" -> ["cpu", "card"]
if err = mapping.AddCustomAnalyzer("codeSplit", map[string]interface{}{
"type": custom.Name,
"char_filters": []string{},
"tokenizer": unicode.Name,
"token_filters": []string{"unicodeNormalize", camelcase.Name, lowercase.Name, "lengthMin2"},
}); err != nil {
return nil, err
}
// Analyzer: exact mode (no camelCase splitting for full-word search)
// "CPUCard" -> ["cpucard"]
if err = mapping.AddCustomAnalyzer("codeExact", map[string]interface{}{
"type": custom.Name,
"char_filters": []string{},
"tokenizer": unicode.Name,
"token_filters": []string{"unicodeNormalize", lowercase.Name},
}); err != nil {
return nil, err
}
// Analyzer: keyword with lowercase (for Languages - single token, no splitting)
if err = mapping.AddCustomAnalyzer("lowercaseKeyword", map[string]interface{}{
"type": custom.Name,
"char_filters": []string{},
"tokenizer": "single",
"token_filters": []string{lowercase.Name},
}); err != nil {
return nil, err
}
// Document mapping
docMapping := bleve.NewDocumentMapping()
docMapping.AddFieldMappingsAt("GistID", bleve.NewNumericFieldMapping())
docMapping.AddFieldMappingsAt("UserID", bleve.NewNumericFieldMapping())
docMapping.AddFieldMappingsAt("Visibility", bleve.NewNumericFieldMapping())
// Content: dual indexing (exact + split)
// "Content" uses the property name so Bleve resolves its analyzer correctly
contentExact := bleve.NewTextFieldMapping()
contentExact.Name = "Content"
contentExact.Analyzer = "codeExact"
contentExact.Store = false
contentExact.IncludeTermVectors = true
contentSplit := bleve.NewTextFieldMapping()
contentSplit.Name = "ContentSplit"
contentSplit.Analyzer = "codeSplit"
contentSplit.Store = false
contentSplit.IncludeTermVectors = true
docMapping.AddFieldMappingsAt("Content", contentExact, contentSplit)
// Languages: keyword analyzer (preserves as single token)
languageFieldMapping := bleve.NewTextFieldMapping()
languageFieldMapping.Analyzer = "lowercaseKeyword"
docMapping.AddFieldMappingsAt("Languages", languageFieldMapping)
// All other text fields use codeSplit as default
docMapping.DefaultAnalyzer = "codeSplit"
mapping.DefaultMapping = docMapping
return bleve.New(i.path, mapping)
}
func (i *BleveIndexer) Reset() error {
i.Close()
if err := os.RemoveAll(i.path); err != nil {
return fmt.Errorf("failed to remove Bleve index directory: %w", err)
}
log.Info().Msg("Bleve index directory removed, re-creating index")
return i.Init()
}
func (i *BleveIndexer) Close() {
if i == nil || i.index == nil {
return
}
err := i.index.Close()
if err != nil {
log.Error().Err(err).Msg("Failed to close Bleve index")
}
log.Info().Msg("Bleve indexer closed")
}
func (i *BleveIndexer) Add(gist *Gist) error {
if gist == nil {
return errors.New("failed to add nil gist to index")
}
return (*atomicIndexer.Load()).(*BleveIndexer).index.Index(strconv.Itoa(int(gist.GistID)), gist)
}
func (i *BleveIndexer) Remove(gistID uint) error {
return (*atomicIndexer.Load()).(*BleveIndexer).index.Delete(strconv.Itoa(int(gistID)))
}
// Search returns a list of Gist IDs that match the given search metadata.
// The method returns an error if any.
//
// The queryMetadata parameter is used to filter the search results.
// For example, passing a non-empty Username will search for gists whose
// username matches the given string.
//
// If the "All" field in queryMetadata is non-empty, the method will
// search across all metadata fields with OR logic. Otherwise, the method
// will add each metadata field with AND logic.
//
// The page parameter is used to paginate the search results.
// The method returns the total number of search results in the second return
// value.
//
// The third return value is a map of language counts for the search results.
// The language counts are computed by asking Bleve to return the top 10
// facets for the "Languages" field.
func (i *BleveIndexer) Search(metadata SearchGistMetadata, userId uint, page int) ([]uint, uint64, map[string]int, error) {
var err error
var indexerQuery query.Query = bleve.NewMatchAllQuery()
// Query factory
factoryQuery := func(field, value string) query.Query {
query := bleve.NewMatchPhraseQuery(value)
query.SetField(field)
return query
}
// Exact search
addQuery := func(field, value string) {
if value != "" && value != "." {
indexerQuery = bleve.NewConjunctionQuery(indexerQuery, factoryQuery(field, value))
}
}
// Query factory for text fields: exact match boosted + match query + prefix
factoryTextQuery := func(field, value string) query.Query {
exact := bleve.NewMatchPhraseQuery(value)
exact.SetField(field)
exact.SetBoost(2.0)
fuzzy := bleve.NewMatchQuery(value)
fuzzy.SetField(field)
fuzzy.SetFuzziness(1)
fuzzy.SetOperator(query.MatchQueryOperatorAnd)
queries := []query.Query{exact, fuzzy}
if len([]rune(value)) >= 2 {
prefix := bleve.NewPrefixQuery(strings.ToLower(value))
prefix.SetField(field)
prefix.SetBoost(1.5)
queries = append(queries, prefix)
}
if len([]rune(value)) >= 4 {
wildcard := bleve.NewWildcardQuery("*" + strings.ToLower(value) + "*")
wildcard.SetField(field)
wildcard.SetBoost(0.5)
queries = append(queries, wildcard)
}
return bleve.NewDisjunctionQuery(queries...)
}
// Query factory for Content: searches both exact (Content) and split (ContentSplit) fields
factoryContentQuery := func(value string) query.Query {
// Exact field (no camelCase split): matches "cpucard"
exactMatch := bleve.NewMatchQuery(value)
exactMatch.SetField("Content")
exactMatch.SetOperator(query.MatchQueryOperatorAnd)
exactMatch.SetBoost(2.0)
// Split field (camelCase split): matches "cpu", "card"
splitMatch := bleve.NewMatchQuery(value)
splitMatch.SetField("ContentSplit")
splitMatch.SetFuzziness(1)
splitMatch.SetOperator(query.MatchQueryOperatorAnd)
splitMatch.SetBoost(1.0)
queries := []query.Query{exactMatch, splitMatch}
if len([]rune(value)) >= 2 {
prefix := bleve.NewPrefixQuery(strings.ToLower(value))
prefix.SetField("Content")
prefix.SetBoost(1.5)
queries = append(queries, prefix)
}
if len([]rune(value)) >= 4 {
wildcard := bleve.NewWildcardQuery("*" + strings.ToLower(value) + "*")
wildcard.SetField("Content")
wildcard.SetBoost(0.5)
queries = append(queries, wildcard)
}
return bleve.NewDisjunctionQuery(queries...)
}
// Text field search
addTextQuery := func(field, value string) {
if value != "" && value != "." {
indexerQuery = bleve.NewConjunctionQuery(indexerQuery, factoryTextQuery(field, value))
}
}
// Visibility filtering: show public gists (Visibility=0) OR user's own gists
visibilityZero := float64(0)
truee := true
publicQuery := bleve.NewNumericRangeInclusiveQuery(&visibilityZero, &visibilityZero, &truee, &truee)
publicQuery.SetField("Visibility")
userIdMatch := float64(userId)
userIdQuery := bleve.NewNumericRangeInclusiveQuery(&userIdMatch, &userIdMatch, &truee, &truee)
userIdQuery.SetField("UserID")
accessQuery := bleve.NewDisjunctionQuery(publicQuery, userIdQuery)
indexerQuery = bleve.NewConjunctionQuery(accessQuery, indexerQuery)
buildFieldQuery := func(field, value string) query.Query {
switch field {
case "Content":
return factoryContentQuery(value)
case "Title", "Description", "Filenames":
return factoryTextQuery(field, value)
case "Extensions":
return factoryQuery(field, "."+value)
default: // Username, Languages, Topics
return factoryQuery(field, value)
}
}
// Handle "All" field - search across all metadata fields with OR logic
if metadata.All != "" {
allQueries := make([]query.Query, 0, len(AllSearchFields))
for _, field := range AllSearchFields {
allQueries = append(allQueries, buildFieldQuery(field, metadata.All))
}
indexerQuery = bleve.NewConjunctionQuery(indexerQuery, bleve.NewDisjunctionQuery(allQueries...))
} else {
// Original behavior: add each metadata field with AND logic
addQuery("Username", metadata.Username)
addTextQuery("Title", metadata.Title)
addTextQuery("Description", metadata.Description)
addQuery("Extensions", "."+metadata.Extension)
addTextQuery("Filenames", metadata.Filename)
addQuery("Languages", metadata.Language)
addQuery("Topics", metadata.Topic)
if metadata.Content != "" {
indexerQuery = bleve.NewConjunctionQuery(indexerQuery, factoryContentQuery(metadata.Content))
}
// Handle default search fields from config with OR logic
if metadata.Default != "" {
var fields []string
for _, f := range strings.Split(config.C.SearchDefault, ",") {
f = strings.TrimSpace(f)
if f == "all" {
fields = AllSearchFields
break
}
if indexField, ok := SearchFieldMap[f]; ok {
fields = append(fields, indexField)
}
}
if len(fields) == 1 {
indexerQuery = bleve.NewConjunctionQuery(indexerQuery, buildFieldQuery(fields[0], metadata.Default))
} else if len(fields) > 1 {
defaultQueries := make([]query.Query, 0, len(fields))
for _, field := range fields {
defaultQueries = append(defaultQueries, buildFieldQuery(field, metadata.Default))
}
indexerQuery = bleve.NewConjunctionQuery(indexerQuery, bleve.NewDisjunctionQuery(defaultQueries...))
}
}
}
languageFacet := bleve.NewFacetRequest("Languages", 10)
perPage := 10
offset := (page - 1) * perPage
s := bleve.NewSearchRequestOptions(indexerQuery, perPage+1, offset, false)
s.AddFacet("languageFacet", languageFacet)
s.Fields = []string{"GistID"}
s.IncludeLocations = false
log.Debug().Interface("searchRequest", s).Msg("Bleve search request")
results, err := (*atomicIndexer.Load()).(*BleveIndexer).index.Search(s)
if err != nil {
return nil, 0, nil, err
}
gistIds := make([]uint, 0, len(results.Hits))
for _, hit := range results.Hits {
gistIds = append(gistIds, uint(hit.Fields["GistID"].(float64)))
}
languageCounts := make(map[string]int)
if facets, found := results.Facets["languageFacet"]; found {
for _, term := range facets.Terms.Terms() {
languageCounts[term.Term] = term.Count
}
}
return gistIds, results.Total, languageCounts, nil
}