ai-code-assistant/pkg/indexer/indexer.go

package indexer

import (
	"context"
	"fmt"
	"log/slog"
	"os"
	"path/filepath"
	"strconv"
	"unicode/utf8"

	"github.com/tmc/langchaingo/schema"
	"github.com/tmc/langchaingo/vectorstores/pgvector"

	"ai-code-assistant/pkg/database"
	"ai-code-assistant/pkg/llm"
)

// Indexer crawls a repository on disk, splits its files into chunks and stores those chunks, together with their
// embeddings, in the vector store so they can be looked up later.
type Indexer struct {
	// repoPath is the root directory that is registered via UpsertRepo and crawled for files.
	repoPath          string
	// chunkSize is the maximum number of bytes handed to chunkFile for a single chunk.
	chunkSize         int
	// force requests a clean re-index when UpsertRepo reports the repository as already present.
	force             bool
	// db is the database handle taken from the context passed to New.
	db                *database.Database
	// llm is the LLM handle taken from the context passed to New; its Embedder is used by the vector store.
	llm               *llm.LLM
	// allowedExtensions lists the file extensions (as returned by filepath.Ext) that are indexed.
	allowedExtensions []string
}

// New builds an Indexer for the repository rooted at path.
//
// chunkSize is the maximum chunk size in bytes and force controls whether a repository that is already present
// gets cleaned and re-indexed. The database and LLM handles are looked up from ctx. Only ".go" files are indexed.
func New(ctx context.Context, path string, chunkSize int, force bool) *Indexer {
	indexer := new(Indexer)

	indexer.repoPath = path
	indexer.chunkSize = chunkSize
	indexer.force = force

	// Both handles are resolved from the context, database first.
	indexer.db = database.FromContext(ctx)
	indexer.llm = llm.FromContext(ctx)

	indexer.allowedExtensions = []string{".go"}

	return indexer
}

// Index registers the repository through UpsertRepo and then indexes its files.
//
// When UpsertRepo reports an update, the repository is treated as already indexed: without the force flag the call
// is a no-op, with the force flag the existing chunks are removed first and the repository is indexed again.
// Otherwise the repository is indexed as new. The first error encountered is returned.
func (idx *Indexer) Index(ctx context.Context) error {
	id, updated, err := idx.db.UpsertRepo(ctx, idx.repoPath)
	if err != nil {
		return err
	}

	switch {
	case !updated:
		slog.Info("indexing new repository", "path", idx.repoPath, "repo_id", id)

	case idx.force:
		slog.Info("repo already indexed, but force flag was set, cleaning and re-indexing")

		if cleanErr := idx.cleanIndexForRepo(ctx, id); cleanErr != nil {
			return cleanErr
		}

	default:
		// Already indexed and no force flag: nothing to do.
		slog.Info("repo already indexed, skipping")

		return nil
	}

	return idx.generateFileChunks(ctx, id)
}

// cleanIndexForRepo executes "clear_chunks_for_repo" with repoID as its only argument on a connection obtained
// from the database handle, releasing the connection afterwards.
// NOTE(review): "clear_chunks_for_repo" is presumably the name of a statement prepared elsewhere — confirm.
func (idx *Indexer) cleanIndexForRepo(ctx context.Context, repoID string) error {
	dbConn, err := idx.db.DB(ctx)
	if err != nil {
		return err
	}
	defer dbConn.Release()

	// The command result itself is of no interest, only whether the call failed.
	_, err = dbConn.Exec(ctx, "clear_chunks_for_repo", repoID)

	return err
}

// crawlFiles walks the directory tree rooted at path depth-first and calls cb once for every entry that is not a
// directory, passing the entry's path joined onto path.
//
// Entries are visited in the order returned by os.ReadDir (sorted by file name); a sub-directory is fully crawled
// before the next sibling is visited. Symbolic links are not followed as directories: os.ReadDir reports them as
// non-directories, so they are handed to cb like regular files.
//
// The walk stops at the first error, which is returned unchanged: an error from os.ReadDir, an error returned by
// cb, or ctx.Err() once ctx has been cancelled or has expired.
func crawlFiles(ctx context.Context, path string, cb func(ctx context.Context, filePath string) error) error {
	// Bail out before touching the file system if the caller has already given up.
	if err := ctx.Err(); err != nil {
		return err
	}

	pathFiles, err := os.ReadDir(path)
	if err != nil {
		return err
	}

	for _, file := range pathFiles {
		// Re-check between entries so a cancellation takes effect promptly even inside a very large directory.
		if err := ctx.Err(); err != nil {
			return err
		}

		filePath := filepath.Join(path, file.Name())

		if file.IsDir() {
			if err := crawlFiles(ctx, filePath, cb); err != nil {
				return err
			}

			continue
		}

		if err := cb(ctx, filePath); err != nil {
			return err
		}
	}

	return nil
}

// generateFileChunks crawls idx.repoPath and adds every chunk of every file whose extension is listed in
// idx.allowedExtensions to the "file_chunks" pgvector collection, embedding it with the embedder of idx.llm.
//
// Each chunk is stored as one document whose metadata holds the file path, the zero-based index of the chunk
// within its file ("chunk_id"), the chunk's start and end byte offsets and repoID; all numeric values are stored
// as decimal strings. A single database connection is held for the whole crawl and released on return.
//
// The first error from acquiring the connection, creating the vector store, crawling, reading a file or storing a
// document aborts the run and is returned.
func (idx *Indexer) generateFileChunks(ctx context.Context, repoID string) error {
	conn, err := idx.db.DB(ctx)
	if err != nil {
		return err
	}
	defer conn.Release()

	vectorStore, err := pgvector.New(ctx,
		pgvector.WithConn(conn),
		pgvector.WithEmbedder(idx.llm.Embedder()),
		pgvector.WithCollectionName("file_chunks"),
	)
	if err != nil {
		return err
	}

	return crawlFiles(ctx, idx.repoPath, func(ctx context.Context, filePath string) error {
		// Decide once per file, before it is opened: files that will never be indexed must not be read into
		// memory, and an unreadable file of no interest must not abort the whole run.
		ext := filepath.Ext(filePath)
		shouldIndex := false
		for _, allowed := range idx.allowedExtensions {
			if ext == allowed {
				shouldIndex = true

				break
			}
		}

		if !shouldIndex {
			return nil
		}

		// chunkID counts the chunks of the current file only; it restarts at zero for every file.
		chunkID := 0

		return chunkFile(ctx, filePath, idx.chunkSize, func(chunk []byte, start, end uint64) error {
			slog.Info("indexing file", "chunk_id", chunkID, "chunk_size", len(chunk), "file_name", filePath)

			docs := []schema.Document{{
				PageContent: string(chunk),
				Metadata: map[string]any{
					"type":      "file_chunk",
					"file_path": filePath,
					"chunk_id":  strconv.Itoa(chunkID),
					"start":     strconv.FormatUint(start, 10),
					"end":       strconv.FormatUint(end, 10),
					"repo_id":   repoID,
				},
			}}

			if _, err := vectorStore.AddDocuments(ctx, docs); err != nil {
				return err
			}

			chunkID++

			return nil
		})
	})
}

// chunkFile reads the file at filePath and hands its content to chunkCb in consecutive chunks that are a suitable
// size to be embedded. Every chunk is at most maxBytes bytes long and is passed together with its half-open byte
// range [start, end) within the file; the chunks cover the file without gaps or overlap. An empty file produces no
// chunks.
//
// A chunk never ends in the middle of a multi-byte UTF-8 sequence when that can be avoided: the cut is moved back
// by up to utf8.UTFMax-1 bytes to the start of the split character, so such a chunk may be slightly shorter than
// maxBytes. If no boundary is found in that window (content that is not UTF-8, or maxBytes smaller than a single
// character) the chunk is cut at exactly maxBytes.
//
// It returns an error if maxBytes is not positive, if the file cannot be read, or if chunkCb fails; the first
// chunkCb error stops the iteration and is returned unchanged. The context parameter is currently unused.
//
// This is a very simple algorithm right now, it would be better to use a lexer to identify good parts of the AST to
// split on. We could also implement a reference graph to find the most relevant files based on the relationships
// between files.
func chunkFile(_ context.Context, filePath string, maxBytes int, chunkCb func(chunk []byte, start, end uint64) error) error {
	// A zero size would never advance and a negative one would produce invalid slice bounds.
	if maxBytes <= 0 {
		return fmt.Errorf("chunking %q: chunk size must be positive, got %d", filePath, maxBytes)
	}

	fileBytes, err := os.ReadFile(filePath)
	if err != nil {
		return err
	}

	for pos := 0; pos < len(fileBytes); {
		// Compare against the remaining length instead of computing pos+maxBytes, which could overflow.
		end := len(fileBytes)
		if maxBytes < end-pos {
			end = pos + maxBytes
		}

		// When the cut lands on a continuation byte, back off to the lead byte of that character so the chunk
		// stays valid UTF-8. The last chunk always runs to the end of the file and needs no adjustment.
		if end < len(fileBytes) {
			cut := end
			for cut > pos && end-cut < utf8.UTFMax-1 && !utf8.RuneStart(fileBytes[cut]) {
				cut--
			}

			// Only accept the adjusted cut if it is a real boundary and still leaves a non-empty chunk.
			if cut > pos && utf8.RuneStart(fileBytes[cut]) {
				end = cut
			}
		}

		if err := chunkCb(fileBytes[pos:end], uint64(pos), uint64(end)); err != nil {
			return err
		}

		pos = end
	}

	return nil
}