设计搜索引擎

问题

如何用 Go 实现一个简易搜索引擎？理解倒排索引和搜索核心原理。

答案

核心架构

倒排索引实现

// 文档
type Document struct {
    ID      int
    Title   string
    Content string
}

// 倒排索引：词 → 文档ID列表
type InvertedIndex struct {
    mu    sync.RWMutex
    index map[string][]int // term → docIDs
    docs  map[int]Document  // docID → doc
}

func NewInvertedIndex() *InvertedIndex {
    return &InvertedIndex{
        index: make(map[string][]int),
        docs:  make(map[int]Document),
    }
}

// 简单分词（英文按空格，中文可用 jieba）
func tokenize(text string) []string {
    text = strings.ToLower(text)
    words := strings.Fields(text)
    // 去停用词
    var tokens []string
    for _, w := range words {
        w = strings.Trim(w, ".,!?;:")
        if len(w) > 1 { // 过滤单字符
            tokens = append(tokens, w)
        }
    }
    return tokens
}

// 添加文档到索引
func (idx *InvertedIndex) Add(doc Document) {
    idx.mu.Lock()
    defer idx.mu.Unlock()

    idx.docs[doc.ID] = doc
    tokens := tokenize(doc.Title + " " + doc.Content)
    seen := make(map[string]bool) // 同一文档去重

    for _, token := range tokens {
        if !seen[token] {
            idx.index[token] = append(idx.index[token], doc.ID)
            seen[token] = true
        }
    }
}

// 搜索：支持多词 AND 查询
func (idx *InvertedIndex) Search(query string) []Document {
    idx.mu.RLock()
    defer idx.mu.RUnlock()

    tokens := tokenize(query)
    if len(tokens) == 0 {
        return nil
    }

    // 取每个词的文档列表的交集
    result := make(map[int]int) // docID → 匹配词数
    for _, token := range tokens {
        for _, docID := range idx.index[token] {
            result[docID]++
        }
    }

    // 过滤：匹配所有词的文档
    var docs []Document
    for docID, count := range result {
        if count == len(tokens) { // AND 逻辑
            docs = append(docs, idx.docs[docID])
        }
    }
    return docs
}

TF-IDF 相关性排序

import "math"

// TF: 词在文档中出现的频率
func tf(term string, doc string) float64 {
    tokens := tokenize(doc)
    count := 0
    for _, t := range tokens {
        if t == term {
            count++
        }
    }
    return float64(count) / float64(len(tokens))
}

// IDF: 逆文档频率，越稀有权重越高
func idf(term string, totalDocs int, docFreq int) float64 {
    return math.Log(float64(totalDocs) / float64(1+docFreq))
}

// TF-IDF 评分函数
func (idx *InvertedIndex) SearchWithScore(query string) []SearchResult {
    tokens := tokenize(query)
    scores := make(map[int]float64)

    totalDocs := len(idx.docs)
    for _, token := range tokens {
        docIDs := idx.index[token]
        idfScore := idf(token, totalDocs, len(docIDs))

        for _, docID := range docIDs {
            doc := idx.docs[docID]
            tfScore := tf(token, doc.Content)
            scores[docID] += tfScore * idfScore
        }
    }

    // 按分数降序排列
    var results []SearchResult
    for docID, score := range scores {
        results = append(results, SearchResult{Doc: idx.docs[docID], Score: score})
    }
    sort.Slice(results, func(i, j int) bool {
        return results[i].Score > results[j].Score
    })
    return results
}

集成 Elasticsearch

import (
    "github.com/elastic/go-elasticsearch/v8"
    "github.com/elastic/go-elasticsearch/v8/typedapi/core/search"
    "github.com/elastic/go-elasticsearch/v8/typedapi/types"
)

func ESSearch(es *elasticsearch.TypedClient) {
    // 创建索引
    es.Indices.Create("articles").Do(context.Background())

    // 索引文档
    es.Index("articles").
        Id("1").
        Document(map[string]string{
            "title":   "Go 并发编程",
            "content": "Goroutine 和 Channel 是 Go 并发的基石",
        }).
        Do(context.Background())

    // 搜索
    resp, _ := es.Search().
        Index("articles").
        Request(&search.Request{
            Query: &types.Query{
                MultiMatch: &types.MultiMatchQuery{
                    Query:  "Go 并发",
                    Fields: []string{"title^2", "content"}, // title 权重 x2
                },
            },
        }).
        Do(context.Background())

    for _, hit := range resp.Hits.Hits {
        log.Printf("ID: %s, Score: %f", *hit.Id_, *hit.Score_)
    }
}

搜索优化

优化点	方案
中文分词	jieba-go / IK 分词器
拼写纠错	编辑距离 + 词频
搜索建议	Trie 前缀树 / Elasticsearch Suggest
高亮	返回结果标记匹配词
缓存	热门查询结果缓存

常见面试问题

Q1: 倒排索引和正排索引有什么区别？

答案：

正排索引：文档 ID → 文档内容（数据库主键查询）
倒排索引：关键词 → 文档 ID 列表（全文搜索）
搜索引擎核心就是倒排索引

Q2: Elasticsearch 为什么快？

答案：

倒排索引 + FST（有限状态转换器）压缩词典
分片并行查询
文件系统缓存（ES 推荐给一半内存给 OS cache）
Skip List 加速 Posting List 合并

问题​

答案​

核心架构​

倒排索引实现​

TF-IDF 相关性排序​

集成 Elasticsearch​

搜索优化​

常见面试问题​

Q1: 倒排索引和正排索引有什么区别？​

Q2: Elasticsearch 为什么快？​

相关链接​

问题

答案

核心架构

倒排索引实现

TF-IDF 相关性排序

集成 Elasticsearch

搜索优化

常见面试问题

Q1: 倒排索引和正排索引有什么区别？

Q2: Elasticsearch 为什么快？

相关链接