First Working Prototype
This application is a simple proof of concept: an agent that takes a prompt and generates a patch implementing code that satisfies the prompt, along with an accompanying unit test.
cmd/indexer/indexer.go (new file, 203 lines):

package indexer

import (
	"context"
	"log/slog"
	"os"
	"path/filepath"
	"strconv"

	"ai-code-assistant/pkg/database"
	"ai-code-assistant/pkg/llm"

	"github.com/go-git/go-billy/v5/osfs"
	"github.com/go-git/go-git/v5"
	"github.com/go-git/go-git/v5/plumbing/cache"
	"github.com/go-git/go-git/v5/storage/filesystem"
	"github.com/google/uuid"
	"github.com/tmc/langchaingo/schema"
	"github.com/tmc/langchaingo/vectorstores/pgvector"
	"github.com/urfave/cli/v3"
)

func Command() *cli.Command {
	return &cli.Command{
		Name:   "indexer",
		Usage:  "this command will index a local git repository to build context for the llm",
		Action: (&indexer{}).run,
		Flags: []cli.Flag{
			&cli.StringFlag{
				Name:     "repo",
				Usage:    "path to git repository",
				Required: true,
			},
			&cli.IntFlag{
				Name:  "chunk-size",
				Usage: "number of bytes to chunk files into, should be roughly 4x the number of tokens",
				Value: 512 * 4, // default budget of ~512 tokens at roughly 4 bytes per token
			},
		},
	}
}
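
With these flags, invoking the subcommand would look roughly like the following; the binary name ai-code-assistant is an assumption based on the module path:

	$ ai-code-assistant indexer --repo /path/to/repo --chunk-size 2048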

type indexer struct {
	db        *database.Database
	llm       *llm.LLM
	repoPath  string
	repoID    string
	chunkSize int
}

func (idx *indexer) run(ctx context.Context, cmd *cli.Command) error {
	// Pull dependencies off the context, where they were attached upstream.
	idx.db = database.FromContext(ctx)
	idx.repoPath = cmd.String("repo")
	idx.chunkSize = int(cmd.Int("chunk-size"))
	idx.llm = llm.FromContext(ctx)

	if err := idx.upsertRepo(ctx); err != nil {
		return err
	}

	if err := idx.generateFileChunks(ctx); err != nil {
		return err
	}

	return nil
}
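
database.FromContext and llm.FromContext live in pkg/database and pkg/llm and are not part of this commit. For illustration only, a typical shape for such an accessor is the standard context-value pattern; the real implementations may differ:

	// Illustrative only: the common context-value pattern that accessors like
	// database.FromContext typically follow.
	package database

	import "context"

	type Database struct{ /* connection pool, prepared statements, ... */ }

	type ctxKey struct{}

	// WithDatabase returns a child context carrying db.
	func WithDatabase(ctx context.Context, db *Database) context.Context {
		return context.WithValue(ctx, ctxKey{}, db)
	}

	// FromContext returns the *Database previously stored with WithDatabase,
	// or nil if none is present.
	func FromContext(ctx context.Context) *Database {
		db, _ := ctx.Value(ctxKey{}).(*Database)
		return db
	}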

// upsertRepo records the repository's path and current HEAD commit under a
// freshly generated UUID, and keeps that ID for use in chunk metadata.
func (idx *indexer) upsertRepo(ctx context.Context) error {
	gitPath := osfs.New(filepath.Join(idx.repoPath, ".git"))

	gitRepo, err := git.Open(filesystem.NewStorage(gitPath, cache.NewObjectLRUDefault()), gitPath)
	if err != nil {
		return err
	}

	headRef, err := gitRepo.Head()
	if err != nil {
		return err
	}

	conn, err := idx.db.DB(ctx)
	if err != nil {
		return err
	}
	defer conn.Release()

	id := uuid.NewString()

	if _, err := conn.Exec(ctx, "insert_repo", id, headRef.Hash().String(), idx.repoPath); err != nil {
		return err
	}

	idx.repoID = id

	return nil
}
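
conn.Exec is called with the name of a prepared statement, insert_repo, whose SQL is registered elsewhere and is not shown in this commit. For orientation only, a hypothetical shape consistent with the three arguments passed above:

	// Purely hypothetical: the real "insert_repo" SQL is defined elsewhere.
	// These column names are guesses that merely line up with the three Exec
	// arguments (id, HEAD hash, repo path).
	const insertRepoSQL = `INSERT INTO repos (id, head_commit, path) VALUES ($1, $2, $3)`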

// crawlFiles walks the directory tree rooted at path depth-first, invoking cb
// for every regular file it finds. Nothing is filtered out here, so hidden
// directories such as .git are traversed as well.
func crawlFiles(ctx context.Context, path string, cb func(ctx context.Context, filePath string) error) error {
	pathFiles, err := os.ReadDir(path)
	if err != nil {
		return err
	}

	for _, file := range pathFiles {
		filePath := filepath.Join(path, file.Name())

		if file.IsDir() {
			if err := crawlFiles(ctx, filePath, cb); err != nil {
				return err
			}
		} else {
			if err := cb(ctx, filePath); err != nil {
				return err
			}
		}
	}

	return nil
}
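
The standard library provides the same depth-first traversal out of the box; a sketch of an equivalent built on filepath.WalkDir (the helper name crawlFilesWalk is made up for illustration):

	// Illustrative alternative to crawlFiles using the standard library.
	package indexer

	import (
		"context"
		"io/fs"
		"path/filepath"
	)

	func crawlFilesWalk(ctx context.Context, root string, cb func(ctx context.Context, filePath string) error) error {
		return filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
			if err != nil {
				return err
			}
			if d.IsDir() {
				return nil // WalkDir descends into directories on its own
			}
			return cb(ctx, path)
		})
	}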

func (idx *indexer) generateFileChunks(ctx context.Context) error {
	conn, err := idx.db.DB(ctx)
	if err != nil {
		return err
	}
	defer conn.Release()

	vectorStore, err := pgvector.New(ctx,
		pgvector.WithConn(conn),
		pgvector.WithEmbedder(idx.llm.Embedder()),
		pgvector.WithCollectionName("file_chunks"),
	)
	if err != nil {
		return err
	}

	allowedExtensions := []string{".go"}

	return crawlFiles(ctx, idx.repoPath, func(ctx context.Context, filePath string) error {
		chunkID := 0

		return chunkFile(ctx, filePath, idx.chunkSize, func(chunk []byte, start, end uint64) error {
			// The extension filter runs inside the chunk callback, so files
			// with other extensions are still read and chunked, just never
			// embedded or stored.
			shouldIndex := false
			for _, ext := range allowedExtensions {
				if filepath.Ext(filePath) == ext {
					shouldIndex = true

					break
				}
			}

			if !shouldIndex {
				return nil
			}

			slog.Info("indexing file", "chunk_id", chunkID, "chunk_size", len(chunk), "file_name", filePath)

			docs := []schema.Document{{
				PageContent: string(chunk),
				Metadata: map[string]any{
					"type":      "file_chunk",
					"file_path": filePath,
					"chunk_id":  strconv.FormatInt(int64(chunkID), 10),
					"start":     strconv.FormatUint(start, 10),
					"end":       strconv.FormatUint(end, 10),
					"repo_id":   idx.repoID,
				},
			}}

			if _, err := vectorStore.AddDocuments(ctx, docs); err != nil {
				return err
			}

			chunkID++
			return nil
		})
	})
}
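
Once chunks land in the file_chunks collection, retrieval on the agent side would presumably go through the same store. A minimal sketch using langchaingo's SimilaritySearch, assuming the same imports as this file; the helper name, query text, and result count are placeholders:

	// Illustrative retrieval over the indexed chunks. Assumes a pgvector.Store
	// constructed exactly as in generateFileChunks above.
	func searchChunks(ctx context.Context, vectorStore pgvector.Store) error {
		docs, err := vectorStore.SimilaritySearch(ctx, "where is the git repository opened?", 5)
		if err != nil {
			return err
		}

		for _, doc := range docs {
			slog.Info("matched chunk",
				"file_path", doc.Metadata["file_path"],
				"chunk_id", doc.Metadata["chunk_id"])
		}

		return nil
	}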

// chunkFile will take a file and return it in chunks of a size suitable for
// embedding. This is a very simple algorithm right now; it would be better to
// use a lexer to identify good parts of the AST to split on. We could also
// implement a reference graph to find the most relevant files based on the
// relationships between files.
func chunkFile(_ context.Context, filePath string, maxBytes int, chunkCb func(chunk []byte, start, end uint64) error) error {
	fileBytes, err := os.ReadFile(filePath)
	if err != nil {
		return err
	}

	pos := 0
	for pos < len(fileBytes) {
		nextChunkSize := maxBytes
		if pos+maxBytes > len(fileBytes) {
			nextChunkSize = len(fileBytes) - pos
		}

		if err := chunkCb(fileBytes[pos:pos+nextChunkSize], uint64(pos), uint64(pos+nextChunkSize)); err != nil {
			return err
		}

		// Advancing by maxBytes (not nextChunkSize) is fine: after the final
		// partial chunk, pos overshoots len(fileBytes) and the loop exits.
		pos += maxBytes
	}

	return nil
}
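
The doc comment above suggests AST-aware splitting; a minimal sketch of that idea for Go sources, chunking on top-level declarations via go/parser. The name chunkGoFile is made up, doc comments attached to declarations are ignored for simplicity, and declarations larger than the embedding budget would still need a fallback split:

	// Illustrative AST-aware chunker for Go files: one chunk per top-level
	// declaration instead of fixed byte windows.
	package indexer

	import (
		"context"
		"go/parser"
		"go/token"
		"os"
	)

	func chunkGoFile(_ context.Context, filePath string, chunkCb func(chunk []byte, start, end uint64) error) error {
		src, err := os.ReadFile(filePath)
		if err != nil {
			return err
		}

		fset := token.NewFileSet()
		f, err := parser.ParseFile(fset, filePath, src, parser.ParseComments)
		if err != nil {
			return err
		}

		// Emit one chunk per top-level declaration (funcs, types, vars, consts).
		for _, decl := range f.Decls {
			start := fset.Position(decl.Pos()).Offset
			end := fset.Position(decl.End()).Offset
			if err := chunkCb(src[start:end], uint64(start), uint64(end)); err != nil {
				return err
			}
		}

		return nil
	}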