ai-code-assistant/cmd/indexer/indexer.go

package indexer

import (
	"ai-code-assistant/pkg/database"
	"ai-code-assistant/pkg/llm"
	"context"
	"github.com/go-git/go-billy/v5/osfs"
	"github.com/go-git/go-git/v5"
	"github.com/go-git/go-git/v5/plumbing/cache"
	"github.com/go-git/go-git/v5/storage/filesystem"
	"github.com/google/uuid"
	"github.com/tmc/langchaingo/schema"
	"github.com/tmc/langchaingo/vectorstores/pgvector"
	"github.com/urfave/cli/v3"
	"log/slog"
	"os"
	"path/filepath"
	"strconv"
)

// Command builds the "indexer" CLI command, which indexes a local git repository so its
// contents can be used as context for the LLM.
//
// Flags:
//   - repo (required): path to the git repository to index.
//   - chunk-size: number of bytes per file chunk; defaults to 4 * 512 (2048).
//
// Every call returns a command bound to a fresh, zero-valued indexer.
func Command() *cli.Command {
	repoFlag := &cli.StringFlag{
		Name:     "repo",
		Usage:    "path to git repository",
		Required: true,
	}

	chunkSizeFlag := &cli.IntFlag{
		Name:  "chunk-size",
		Usage: "number of bytes to chunk files into, should be roughly 4x the number of tokens",
		Value: 4 * 512,
	}

	command := &cli.Command{
		Name:  "indexer",
		Usage: "this command will index a local git repository to build context for the llm",
		Flags: []cli.Flag{repoFlag, chunkSizeFlag},
	}
	// The action is a method value, so all per-invocation state lives on this indexer.
	command.Action = new(indexer).run

	return command
}

// indexer holds the state for one invocation of the "indexer" command. All fields are
// populated by run (directly or via upsertRepo) before any indexing work starts.
type indexer struct {
	// db is the database handle obtained from the command context in run.
	db        *database.Database
	// llm is the LLM client obtained from the command context in run; its Embedder is
	// handed to the vector store in generateFileChunks.
	llm       *llm.LLM
	// repoPath is the value of the "repo" flag: the root of the repository to index.
	repoPath  string
	// repoID is the UUID generated by upsertRepo and passed to the "insert_repo" statement;
	// it is attached to every stored chunk as the "repo_id" metadata value.
	repoID    string
	// chunkSize is the value of the "chunk-size" flag: the maximum chunk length in bytes.
	chunkSize int
}

// run is the cli action for the indexer command. It copies the flag values and the
// context-provided database and LLM handles onto idx, registers the repository via
// upsertRepo, and then chunks and stores its files via generateFileChunks.
//
// The first error from either step is returned unchanged.
func (idx *indexer) run(ctx context.Context, cmd *cli.Command) error {
	// Flag-derived settings.
	idx.repoPath = cmd.String("repo")
	idx.chunkSize = int(cmd.Int("chunk-size"))

	// Dependencies carried on the context.
	idx.db = database.FromContext(ctx)
	idx.llm = llm.FromContext(ctx)

	// The repository must be registered first: generateFileChunks tags every chunk with
	// the repoID that upsertRepo assigns.
	err := idx.upsertRepo(ctx)
	if err != nil {
		return err
	}

	return idx.generateFileChunks(ctx)
}

// upsertRepo opens the git metadata under <repoPath>/.git, resolves HEAD, and executes the
// "insert_repo" statement with a freshly generated UUID, the HEAD commit hash and the
// repository path. On success the UUID is stored in idx.repoID.
//
// Any error from opening the repository, resolving HEAD, acquiring a database connection or
// executing the statement is returned unchanged, and idx.repoID is left untouched.
func (idx *indexer) upsertRepo(ctx context.Context) error {
	dotGit := osfs.New(filepath.Join(idx.repoPath, ".git"))
	storage := filesystem.NewStorage(dotGit, cache.NewObjectLRUDefault())

	repo, err := git.Open(storage, dotGit)
	if err != nil {
		return err
	}

	head, err := repo.Head()
	if err != nil {
		return err
	}

	conn, err := idx.db.DB(ctx)
	if err != nil {
		return err
	}
	defer conn.Release()

	repoID := uuid.NewString()

	_, err = conn.Exec(ctx, "insert_repo", repoID, head.Hash().String(), idx.repoPath)
	if err != nil {
		return err
	}

	// Only publish the ID once the statement has succeeded.
	idx.repoID = repoID

	return nil
}

// crawlFiles walks the directory tree rooted at path depth-first and calls cb once for every
// entry that is not a directory, passing the entry's name joined onto its parent path.
//
// Entries of each directory are visited in the order os.ReadDir returns them (sorted by
// filename); a sub-directory is fully walked before its next sibling is visited.
//
// The walk stops at the first failure and returns that error unchanged. Failures are:
//   - ctx being cancelled or past its deadline (ctx.Err() is returned),
//   - os.ReadDir failing for path or any sub-directory,
//   - cb returning a non-nil error.
func crawlFiles(ctx context.Context, path string, cb func(ctx context.Context, filePath string) error) error {
	entries, err := os.ReadDir(path)
	if err != nil {
		return err
	}

	for _, entry := range entries {
		// Check before every entry so a cancelled context stops a large walk promptly
		// instead of visiting (and invoking cb for) the rest of the tree.
		if err := ctx.Err(); err != nil {
			return err
		}

		entryPath := filepath.Join(path, entry.Name())

		if entry.IsDir() {
			if err := crawlFiles(ctx, entryPath, cb); err != nil {
				return err
			}

			continue
		}

		if err := cb(ctx, entryPath); err != nil {
			return err
		}
	}

	return nil
}

// generateFileChunks walks every file under idx.repoPath, splits each file with an allowed
// extension (currently only ".go") into chunks of at most idx.chunkSize bytes, and adds each
// chunk as a document to the "file_chunks" pgvector collection using the LLM's embedder.
//
// Each document carries string metadata: "type" ("file_chunk"), "file_path", "chunk_id"
// (0-based index of the chunk within its file), "start" and "end" (the offsets reported by
// chunkFile) and "repo_id" (idx.repoID).
//
// The first error from acquiring a connection, creating the vector store, walking the tree,
// reading a file or adding a document aborts indexing and is returned unchanged.
func (idx *indexer) generateFileChunks(ctx context.Context) error {
	conn, err := idx.db.DB(ctx)
	if err != nil {
		return err
	}
	defer conn.Release()

	vectorStore, err := pgvector.New(ctx,
		pgvector.WithConn(conn),
		pgvector.WithEmbedder(idx.llm.Embedder()),
		pgvector.WithCollectionName("file_chunks"),
	)
	if err != nil {
		return err
	}

	allowedExtensions := []string{".go"}

	return crawlFiles(ctx, idx.repoPath, func(ctx context.Context, filePath string) error {
		// Filter on the extension BEFORE chunkFile reads the file. Otherwise every file in
		// the tree (including everything under .git) is loaded into memory only to have its
		// chunks discarded, and an unreadable file we never wanted aborts the whole run.
		fileExt := filepath.Ext(filePath)
		shouldIndex := false
		for _, ext := range allowedExtensions {
			if fileExt == ext {
				shouldIndex = true

				break
			}
		}

		if !shouldIndex {
			return nil
		}

		// chunkID counts the chunks of this file only; it restarts at 0 for every file.
		chunkID := 0

		return chunkFile(ctx, filePath, idx.chunkSize, func(chunk []byte, start, end uint64) error {
			slog.Info("indexing file", "chunk_id", chunkID, "chunk_size", len(chunk), "file_name", filePath)

			docs := []schema.Document{{
				PageContent: string(chunk),
				Metadata: map[string]any{
					"type":      "file_chunk",
					"file_path": filePath,
					"chunk_id":  strconv.Itoa(chunkID),
					"start":     strconv.FormatUint(start, 10),
					"end":       strconv.FormatUint(end, 10),
					"repo_id":   idx.repoID,
				},
			}}

			if _, err := vectorStore.AddDocuments(ctx, docs); err != nil {
				return err
			}

			chunkID++

			return nil
		})
	})
}

// chunkFile reads the file at filePath and passes its contents to chunkCb as consecutive,
// non-overlapping chunks of at most maxBytes bytes each. start and end are the byte offsets
// of the chunk within the file (end exclusive). An empty file produces no callbacks.
//
// A chunk boundary that would fall inside a multi-byte UTF-8 sequence is moved back to the
// start of that sequence, so every chunk of a valid UTF-8 file is itself valid UTF-8 once
// converted to a string. The boundary is left where it is when the data is not valid UTF-8
// there, or when maxBytes is too small to hold the character.
//
// Errors:
//   - maxBytes <= 0 yields an *os.PathError wrapping os.ErrInvalid before the file is read
//     (a zero size could never make progress and a negative one cannot be sliced),
//   - an os.ReadFile failure is returned unchanged,
//   - the first non-nil error from chunkCb stops chunking and is returned unchanged.
//
// This is a very simple algorithm right now; it would be better to use a lexer to identify
// good parts of the AST to split on. We could also implement a reference graph to find the
// most relevant files based on the relationships between files.
func chunkFile(_ context.Context, filePath string, maxBytes int, chunkCb func(chunk []byte, start, end uint64) error) error {
	if maxBytes <= 0 {
		return &os.PathError{Op: "chunk", Path: filePath, Err: os.ErrInvalid}
	}

	fileBytes, err := os.ReadFile(filePath)
	if err != nil {
		return err
	}

	for pos := 0; pos < len(fileBytes); {
		end := len(fileBytes)
		// Compare against the remaining length rather than computing pos+maxBytes first,
		// so a very large maxBytes cannot overflow.
		if maxBytes < end-pos {
			end = runeAlignedEnd(fileBytes, pos, pos+maxBytes)
		}

		if err := chunkCb(fileBytes[pos:end], uint64(pos), uint64(end)); err != nil {
			return err
		}

		// Continue from the actual end of this chunk, which may be before pos+maxBytes.
		pos = end
	}

	return nil
}

// runeAlignedEnd returns the largest cut point in (start, end] that does not split a UTF-8
// sequence in data, or end itself when no such point exists within one sequence length.
// It requires start < end < len(data).
func runeAlignedEnd(data []byte, start, end int) int {
	// A UTF-8 sequence has at most three continuation bytes (bit pattern 10xxxxxx), so
	// the lead byte is never more than three positions before a continuation byte.
	const maxContinuationBytes = 3

	cut := end
	for cut > start && end-cut < maxContinuationBytes && data[cut]&0xC0 == 0x80 {
		cut--
	}

	// Only move the boundary when we landed on a sequence start and the chunk stays
	// non-empty; otherwise fall back to the plain byte boundary to guarantee progress.
	if cut > start && data[cut]&0xC0 != 0x80 {
		return cut
	}

	return end
}