Signed-off-by: Thomas Miceli <tho.miceli@gmail.com> Co-authored-by: Qiang Zhou <zhouqiang.loaded@bytedance.com> Co-authored-by: theodoruszq <theodoruszq@gmail.com>
384 lines
12 KiB
Go
384 lines
12 KiB
Go
package index
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"os"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"github.com/blevesearch/bleve/v2"
|
|
"github.com/blevesearch/bleve/v2/analysis/analyzer/custom"
|
|
"github.com/blevesearch/bleve/v2/analysis/token/camelcase"
|
|
"github.com/blevesearch/bleve/v2/analysis/token/length"
|
|
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
|
|
"github.com/blevesearch/bleve/v2/analysis/token/unicodenorm"
|
|
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
|
|
"github.com/blevesearch/bleve/v2/search/query"
|
|
"github.com/rs/zerolog/log"
|
|
"github.com/thomiceli/opengist/internal/config"
|
|
)
|
|
|
|
type BleveIndexer struct {
|
|
index bleve.Index
|
|
path string
|
|
}
|
|
|
|
func NewBleveIndexer(path string) *BleveIndexer {
|
|
return &BleveIndexer{path: path}
|
|
}
|
|
|
|
func (i *BleveIndexer) Init() error {
|
|
errChan := make(chan error, 1)
|
|
|
|
go func() {
|
|
bleveIndex, err := i.open()
|
|
if err != nil {
|
|
log.Error().Err(err).Msg("Failed to open Bleve index")
|
|
i.Close()
|
|
errChan <- err
|
|
return
|
|
}
|
|
i.index = bleveIndex
|
|
log.Info().Msg("Bleve indexer initialized")
|
|
errChan <- nil
|
|
}()
|
|
|
|
return <-errChan
|
|
}
|
|
|
|
func (i *BleveIndexer) open() (bleve.Index, error) {
|
|
bleveIndex, err := bleve.Open(i.path)
|
|
if err == nil {
|
|
return bleveIndex, nil
|
|
}
|
|
|
|
if !errors.Is(err, bleve.ErrorIndexPathDoesNotExist) {
|
|
return nil, err
|
|
}
|
|
|
|
mapping := bleve.NewIndexMapping()
|
|
|
|
// Token filters
|
|
if err = mapping.AddCustomTokenFilter("unicodeNormalize", map[string]any{
|
|
"type": unicodenorm.Name,
|
|
"form": unicodenorm.NFC,
|
|
}); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if err = mapping.AddCustomTokenFilter("lengthMin2", map[string]interface{}{
|
|
"type": length.Name,
|
|
"min": 2.0,
|
|
}); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Analyzer: split mode (camelCase splitting for partial search)
|
|
// "CPUCard" -> ["cpu", "card"]
|
|
if err = mapping.AddCustomAnalyzer("codeSplit", map[string]interface{}{
|
|
"type": custom.Name,
|
|
"char_filters": []string{},
|
|
"tokenizer": unicode.Name,
|
|
"token_filters": []string{"unicodeNormalize", camelcase.Name, lowercase.Name, "lengthMin2"},
|
|
}); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Analyzer: exact mode (no camelCase splitting for full-word search)
|
|
// "CPUCard" -> ["cpucard"]
|
|
if err = mapping.AddCustomAnalyzer("codeExact", map[string]interface{}{
|
|
"type": custom.Name,
|
|
"char_filters": []string{},
|
|
"tokenizer": unicode.Name,
|
|
"token_filters": []string{"unicodeNormalize", lowercase.Name},
|
|
}); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Analyzer: keyword with lowercase (for Languages - single token, no splitting)
|
|
if err = mapping.AddCustomAnalyzer("lowercaseKeyword", map[string]interface{}{
|
|
"type": custom.Name,
|
|
"char_filters": []string{},
|
|
"tokenizer": "single",
|
|
"token_filters": []string{lowercase.Name},
|
|
}); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Document mapping
|
|
docMapping := bleve.NewDocumentMapping()
|
|
docMapping.AddFieldMappingsAt("GistID", bleve.NewNumericFieldMapping())
|
|
docMapping.AddFieldMappingsAt("UserID", bleve.NewNumericFieldMapping())
|
|
docMapping.AddFieldMappingsAt("Visibility", bleve.NewNumericFieldMapping())
|
|
|
|
// Content: dual indexing (exact + split)
|
|
// "Content" uses the property name so Bleve resolves its analyzer correctly
|
|
contentExact := bleve.NewTextFieldMapping()
|
|
contentExact.Name = "Content"
|
|
contentExact.Analyzer = "codeExact"
|
|
contentExact.Store = false
|
|
contentExact.IncludeTermVectors = true
|
|
|
|
contentSplit := bleve.NewTextFieldMapping()
|
|
contentSplit.Name = "ContentSplit"
|
|
contentSplit.Analyzer = "codeSplit"
|
|
contentSplit.Store = false
|
|
contentSplit.IncludeTermVectors = true
|
|
|
|
docMapping.AddFieldMappingsAt("Content", contentExact, contentSplit)
|
|
|
|
// Languages: keyword analyzer (preserves as single token)
|
|
languageFieldMapping := bleve.NewTextFieldMapping()
|
|
languageFieldMapping.Analyzer = "lowercaseKeyword"
|
|
docMapping.AddFieldMappingsAt("Languages", languageFieldMapping)
|
|
|
|
// All other text fields use codeSplit as default
|
|
docMapping.DefaultAnalyzer = "codeSplit"
|
|
mapping.DefaultMapping = docMapping
|
|
|
|
return bleve.New(i.path, mapping)
|
|
}
|
|
|
|
func (i *BleveIndexer) Reset() error {
|
|
i.Close()
|
|
if err := os.RemoveAll(i.path); err != nil {
|
|
return fmt.Errorf("failed to remove Bleve index directory: %w", err)
|
|
}
|
|
log.Info().Msg("Bleve index directory removed, re-creating index")
|
|
return i.Init()
|
|
}
|
|
|
|
func (i *BleveIndexer) Close() {
|
|
if i == nil || i.index == nil {
|
|
return
|
|
}
|
|
|
|
err := i.index.Close()
|
|
if err != nil {
|
|
log.Error().Err(err).Msg("Failed to close Bleve index")
|
|
}
|
|
log.Info().Msg("Bleve indexer closed")
|
|
}
|
|
|
|
func (i *BleveIndexer) Add(gist *Gist) error {
|
|
if gist == nil {
|
|
return errors.New("failed to add nil gist to index")
|
|
}
|
|
return (*atomicIndexer.Load()).(*BleveIndexer).index.Index(strconv.Itoa(int(gist.GistID)), gist)
|
|
}
|
|
|
|
func (i *BleveIndexer) Remove(gistID uint) error {
|
|
return (*atomicIndexer.Load()).(*BleveIndexer).index.Delete(strconv.Itoa(int(gistID)))
|
|
}
|
|
|
|
// Search returns a list of Gist IDs that match the given search metadata.
|
|
// The method returns an error if any.
|
|
//
|
|
// The queryMetadata parameter is used to filter the search results.
|
|
// For example, passing a non-empty Username will search for gists whose
|
|
// username matches the given string.
|
|
//
|
|
// If the "All" field in queryMetadata is non-empty, the method will
|
|
// search across all metadata fields with OR logic. Otherwise, the method
|
|
// will add each metadata field with AND logic.
|
|
//
|
|
// The page parameter is used to paginate the search results.
|
|
// The method returns the total number of search results in the second return
|
|
// value.
|
|
//
|
|
// The third return value is a map of language counts for the search results.
|
|
// The language counts are computed by asking Bleve to return the top 10
|
|
// facets for the "Languages" field.
|
|
func (i *BleveIndexer) Search(metadata SearchGistMetadata, userId uint, page int) ([]uint, uint64, map[string]int, error) {
|
|
var err error
|
|
var indexerQuery query.Query = bleve.NewMatchAllQuery()
|
|
|
|
// Query factory
|
|
factoryQuery := func(field, value string) query.Query {
|
|
query := bleve.NewMatchPhraseQuery(value)
|
|
query.SetField(field)
|
|
return query
|
|
}
|
|
|
|
// Exact search
|
|
addQuery := func(field, value string) {
|
|
if value != "" && value != "." {
|
|
indexerQuery = bleve.NewConjunctionQuery(indexerQuery, factoryQuery(field, value))
|
|
}
|
|
}
|
|
|
|
// Query factory for text fields: exact match boosted + match query + prefix
|
|
factoryTextQuery := func(field, value string) query.Query {
|
|
exact := bleve.NewMatchPhraseQuery(value)
|
|
exact.SetField(field)
|
|
exact.SetBoost(2.0)
|
|
|
|
fuzzy := bleve.NewMatchQuery(value)
|
|
fuzzy.SetField(field)
|
|
fuzzy.SetFuzziness(1)
|
|
fuzzy.SetOperator(query.MatchQueryOperatorAnd)
|
|
|
|
queries := []query.Query{exact, fuzzy}
|
|
|
|
if len([]rune(value)) >= 2 {
|
|
prefix := bleve.NewPrefixQuery(strings.ToLower(value))
|
|
prefix.SetField(field)
|
|
prefix.SetBoost(1.5)
|
|
queries = append(queries, prefix)
|
|
}
|
|
|
|
if len([]rune(value)) >= 4 {
|
|
wildcard := bleve.NewWildcardQuery("*" + strings.ToLower(value) + "*")
|
|
wildcard.SetField(field)
|
|
wildcard.SetBoost(0.5)
|
|
queries = append(queries, wildcard)
|
|
}
|
|
|
|
return bleve.NewDisjunctionQuery(queries...)
|
|
}
|
|
|
|
// Query factory for Content: searches both exact (Content) and split (ContentSplit) fields
|
|
factoryContentQuery := func(value string) query.Query {
|
|
// Exact field (no camelCase split): matches "cpucard"
|
|
exactMatch := bleve.NewMatchQuery(value)
|
|
exactMatch.SetField("Content")
|
|
exactMatch.SetOperator(query.MatchQueryOperatorAnd)
|
|
exactMatch.SetBoost(2.0)
|
|
|
|
// Split field (camelCase split): matches "cpu", "card"
|
|
splitMatch := bleve.NewMatchQuery(value)
|
|
splitMatch.SetField("ContentSplit")
|
|
splitMatch.SetFuzziness(1)
|
|
splitMatch.SetOperator(query.MatchQueryOperatorAnd)
|
|
splitMatch.SetBoost(1.0)
|
|
|
|
queries := []query.Query{exactMatch, splitMatch}
|
|
|
|
if len([]rune(value)) >= 2 {
|
|
prefix := bleve.NewPrefixQuery(strings.ToLower(value))
|
|
prefix.SetField("Content")
|
|
prefix.SetBoost(1.5)
|
|
queries = append(queries, prefix)
|
|
}
|
|
|
|
if len([]rune(value)) >= 4 {
|
|
wildcard := bleve.NewWildcardQuery("*" + strings.ToLower(value) + "*")
|
|
wildcard.SetField("Content")
|
|
wildcard.SetBoost(0.5)
|
|
queries = append(queries, wildcard)
|
|
}
|
|
|
|
return bleve.NewDisjunctionQuery(queries...)
|
|
}
|
|
|
|
// Text field search
|
|
addTextQuery := func(field, value string) {
|
|
if value != "" && value != "." {
|
|
indexerQuery = bleve.NewConjunctionQuery(indexerQuery, factoryTextQuery(field, value))
|
|
}
|
|
}
|
|
|
|
// Visibility filtering: show public gists (Visibility=0) OR user's own gists
|
|
visibilityZero := float64(0)
|
|
truee := true
|
|
publicQuery := bleve.NewNumericRangeInclusiveQuery(&visibilityZero, &visibilityZero, &truee, &truee)
|
|
publicQuery.SetField("Visibility")
|
|
|
|
userIdMatch := float64(userId)
|
|
userIdQuery := bleve.NewNumericRangeInclusiveQuery(&userIdMatch, &userIdMatch, &truee, &truee)
|
|
userIdQuery.SetField("UserID")
|
|
|
|
accessQuery := bleve.NewDisjunctionQuery(publicQuery, userIdQuery)
|
|
indexerQuery = bleve.NewConjunctionQuery(accessQuery, indexerQuery)
|
|
|
|
buildFieldQuery := func(field, value string) query.Query {
|
|
switch field {
|
|
case "Content":
|
|
return factoryContentQuery(value)
|
|
case "Title", "Description", "Filenames":
|
|
return factoryTextQuery(field, value)
|
|
case "Extensions":
|
|
return factoryQuery(field, "."+value)
|
|
default: // Username, Languages, Topics
|
|
return factoryQuery(field, value)
|
|
}
|
|
}
|
|
|
|
// Handle "All" field - search across all metadata fields with OR logic
|
|
if metadata.All != "" {
|
|
allQueries := make([]query.Query, 0, len(AllSearchFields))
|
|
for _, field := range AllSearchFields {
|
|
allQueries = append(allQueries, buildFieldQuery(field, metadata.All))
|
|
}
|
|
indexerQuery = bleve.NewConjunctionQuery(indexerQuery, bleve.NewDisjunctionQuery(allQueries...))
|
|
} else {
|
|
// Original behavior: add each metadata field with AND logic
|
|
addQuery("Username", metadata.Username)
|
|
addTextQuery("Title", metadata.Title)
|
|
addTextQuery("Description", metadata.Description)
|
|
addQuery("Extensions", "."+metadata.Extension)
|
|
addTextQuery("Filenames", metadata.Filename)
|
|
addQuery("Languages", metadata.Language)
|
|
addQuery("Topics", metadata.Topic)
|
|
if metadata.Content != "" {
|
|
indexerQuery = bleve.NewConjunctionQuery(indexerQuery, factoryContentQuery(metadata.Content))
|
|
}
|
|
|
|
// Handle default search fields from config with OR logic
|
|
if metadata.Default != "" {
|
|
var fields []string
|
|
for _, f := range strings.Split(config.C.SearchDefault, ",") {
|
|
f = strings.TrimSpace(f)
|
|
if f == "all" {
|
|
fields = AllSearchFields
|
|
break
|
|
}
|
|
if indexField, ok := SearchFieldMap[f]; ok {
|
|
fields = append(fields, indexField)
|
|
}
|
|
}
|
|
if len(fields) == 1 {
|
|
indexerQuery = bleve.NewConjunctionQuery(indexerQuery, buildFieldQuery(fields[0], metadata.Default))
|
|
} else if len(fields) > 1 {
|
|
defaultQueries := make([]query.Query, 0, len(fields))
|
|
for _, field := range fields {
|
|
defaultQueries = append(defaultQueries, buildFieldQuery(field, metadata.Default))
|
|
}
|
|
indexerQuery = bleve.NewConjunctionQuery(indexerQuery, bleve.NewDisjunctionQuery(defaultQueries...))
|
|
}
|
|
}
|
|
}
|
|
|
|
languageFacet := bleve.NewFacetRequest("Languages", 10)
|
|
|
|
perPage := 10
|
|
offset := (page - 1) * perPage
|
|
|
|
s := bleve.NewSearchRequestOptions(indexerQuery, perPage+1, offset, false)
|
|
s.AddFacet("languageFacet", languageFacet)
|
|
s.Fields = []string{"GistID"}
|
|
s.IncludeLocations = false
|
|
|
|
log.Debug().Interface("searchRequest", s).Msg("Bleve search request")
|
|
|
|
results, err := (*atomicIndexer.Load()).(*BleveIndexer).index.Search(s)
|
|
if err != nil {
|
|
return nil, 0, nil, err
|
|
}
|
|
|
|
gistIds := make([]uint, 0, len(results.Hits))
|
|
for _, hit := range results.Hits {
|
|
gistIds = append(gistIds, uint(hit.Fields["GistID"].(float64)))
|
|
}
|
|
|
|
languageCounts := make(map[string]int)
|
|
if facets, found := results.Facets["languageFacet"]; found {
|
|
for _, term := range facets.Terms.Terms() {
|
|
languageCounts[term.Term] = term.Count
|
|
}
|
|
}
|
|
|
|
return gistIds, results.Total, languageCounts, nil
|
|
}
|