package indexer

import (
	"ai-code-assistant/pkg/database"
	"ai-code-assistant/pkg/llm"
	"context"
	"log/slog"
	"os"
	"path/filepath"
	"strconv"

	"github.com/go-git/go-billy/v5/osfs"
	"github.com/go-git/go-git/v5"
	"github.com/go-git/go-git/v5/plumbing/cache"
	"github.com/go-git/go-git/v5/storage/filesystem"
	"github.com/google/uuid"
	"github.com/tmc/langchaingo/schema"
	"github.com/tmc/langchaingo/vectorstores/pgvector"
	"github.com/urfave/cli/v3"
)

// Command returns the CLI command that indexes a local git repository.
func Command() *cli.Command {
	return &cli.Command{
		Name:   "indexer",
		Usage:  "this command will index a local git repository to build context for the llm",
		Action: (&indexer{}).run,
		Flags: []cli.Flag{
			&cli.StringFlag{
				Name:     "repo",
				Usage:    "path to git repository",
				Required: true,
			},
			&cli.IntFlag{
				Name:  "chunk-size",
				Usage: "number of bytes to chunk files into; should be roughly 4x the desired number of tokens",
				Value: 512 * 4,
			},
		},
	}
}

type indexer struct {
	db        *database.Database
	llm       *llm.LLM
	repoPath  string
	repoID    string
	chunkSize int
}

func (idx *indexer) run(ctx context.Context, cmd *cli.Command) error {
	idx.db = database.FromContext(ctx)
	idx.repoPath = cmd.String("repo")
	idx.chunkSize = int(cmd.Int("chunk-size"))
	idx.llm = llm.FromContext(ctx)

	if err := idx.upsertRepo(ctx); err != nil {
		return err
	}

	if err := idx.generateFileChunks(ctx); err != nil {
		return err
	}

	return nil
}

// upsertRepo records the repository path and its current HEAD commit in the database,
// storing the generated repo ID on the indexer for use when tagging chunks.
func (idx *indexer) upsertRepo(ctx context.Context) error {
	gitPath := osfs.New(filepath.Join(idx.repoPath, ".git"))
	gitRepo, err := git.Open(filesystem.NewStorage(gitPath, cache.NewObjectLRUDefault()), gitPath)
	if err != nil {
		return err
	}

	headRef, err := gitRepo.Head()
	if err != nil {
		return err
	}

	conn, err := idx.db.DB(ctx)
	if err != nil {
		return err
	}
	defer conn.Release()

	id := uuid.NewString()
	if _, err := conn.Exec(ctx, "insert_repo", id, headRef.Hash().String(), idx.repoPath); err != nil {
		return err
	}
	idx.repoID = id

	return nil
}

// crawlFiles walks the directory tree rooted at path, calling cb for every regular file it finds.
func crawlFiles(ctx context.Context, path string, cb func(ctx context.Context, filePath string) error) error {
	pathFiles, err := os.ReadDir(path)
	if err != nil {
		return err
	}

	for _, file := range pathFiles {
		filePath := filepath.Join(path, file.Name())
		if file.IsDir() {
			if err := crawlFiles(ctx, filePath, cb); err != nil {
				return err
			}
		} else {
			if err := cb(ctx, filePath); err != nil {
				return err
			}
		}
	}

	return nil
}

// generateFileChunks chunks every allowed file in the repository and stores each chunk,
// along with its embedding and metadata, in the pgvector-backed "file_chunks" collection.
func (idx *indexer) generateFileChunks(ctx context.Context) error {
	conn, err := idx.db.DB(ctx)
	if err != nil {
		return err
	}
	defer conn.Release()

	vectorStore, err := pgvector.New(ctx,
		pgvector.WithConn(conn),
		pgvector.WithEmbedder(idx.llm.Embedder()),
		pgvector.WithCollectionName("file_chunks"),
	)
	if err != nil {
		return err
	}

	allowedExtensions := []string{".go"}

	return crawlFiles(ctx, idx.repoPath, func(ctx context.Context, filePath string) error {
		chunkID := 0
		return chunkFile(ctx, filePath, idx.chunkSize, func(chunk []byte, start, end uint64) error {
			shouldIndex := false
			for _, ext := range allowedExtensions {
				if filepath.Ext(filePath) == ext {
					shouldIndex = true
					break
				}
			}
			if !shouldIndex {
				return nil
			}

			slog.Info("indexing file", "chunk_id", chunkID, "chunk_size", len(chunk), "file_name", filePath)

			docs := []schema.Document{{
				PageContent: string(chunk),
				Metadata: map[string]any{
					"type":      "file_chunk",
					"file_path": filePath,
					"chunk_id":  strconv.FormatInt(int64(chunkID), 10),
					"start":     strconv.FormatUint(start, 10),
					"end":       strconv.FormatUint(end, 10),
					"repo_id":   idx.repoID,
				},
			}}

			if _, err := vectorStore.AddDocuments(ctx, docs); err != nil {
				return err
			}

			chunkID++
			return nil
		})
	})
}

// chunkFile reads a file and returns it in chunks of a suitable size to be embedded.
// This is a very simple algorithm right now; it would be better to use a lexer to identify good parts of the AST to
// split on. We could also implement a reference graph to find the most relevant files based on the relationships
// between files.
func chunkFile(_ context.Context, filePath string, maxBytes int, chunkCb func(chunk []byte, start, end uint64) error) error {
	fileBytes, err := os.ReadFile(filePath)
	if err != nil {
		return err
	}

	pos := 0
	for pos < len(fileBytes) {
		// The final chunk may be shorter than maxBytes.
		nextChunkSize := maxBytes
		if pos+maxBytes > len(fileBytes) {
			nextChunkSize = len(fileBytes) - pos
		}

		if err := chunkCb(fileBytes[pos:pos+nextChunkSize], uint64(pos), uint64(pos+nextChunkSize)); err != nil {
			return err
		}

		pos += maxBytes
	}

	return nil
}
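
// One possible shape for the AST-based splitting mentioned above (a hypothetical sketch, not
// wired into the indexer): parse the file with go/parser and emit one chunk per top-level
// declaration, so chunk boundaries fall on function/type boundaries instead of arbitrary byte
// offsets. Enabling it would require adding the "go/parser" and "go/token" imports; it is kept
// commented out so the byte-offset chunker above remains the only active implementation.
//
//	func chunkGoFileByDecl(filePath string, chunkCb func(chunk []byte, start, end uint64) error) error {
//		src, err := os.ReadFile(filePath)
//		if err != nil {
//			return err
//		}
//
//		fset := token.NewFileSet()
//		parsed, err := parser.ParseFile(fset, filePath, src, parser.ParseComments)
//		if err != nil {
//			return err
//		}
//
//		// Emit one chunk per top-level declaration (funcs, types, vars, consts).
//		for _, decl := range parsed.Decls {
//			start := uint64(fset.Position(decl.Pos()).Offset)
//			end := uint64(fset.Position(decl.End()).Offset)
//			if err := chunkCb(src[start:end], start, end); err != nil {
//				return err
//			}
//		}
//		return nil
//	}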