First Working Prototype
This application is a simple proof of concept: an agent that takes a prompt and generates a patch implementing code that satisfies the prompt, along with an accompanying unit test.
cmd/indexer/indexer.go (new file, 203 lines):

package indexer

import (
	"context"
	"log/slog"
	"os"
	"path/filepath"
	"strconv"

	"ai-code-assistant/pkg/database"
	"ai-code-assistant/pkg/llm"

	"github.com/go-git/go-billy/v5/osfs"
	"github.com/go-git/go-git/v5"
	"github.com/go-git/go-git/v5/plumbing/cache"
	"github.com/go-git/go-git/v5/storage/filesystem"
	"github.com/google/uuid"
	"github.com/tmc/langchaingo/schema"
	"github.com/tmc/langchaingo/vectorstores/pgvector"
	"github.com/urfave/cli/v3"
)

func Command() *cli.Command {
	return &cli.Command{
		Name:   "indexer",
		Usage:  "this command will index a local git repository to build context for the llm",
		Action: (&indexer{}).run,
		Flags: []cli.Flag{
			&cli.StringFlag{
				Name:     "repo",
				Usage:    "path to git repository",
				Required: true,
			},
			&cli.IntFlag{
				Name:  "chunk-size",
				Usage: "number of bytes to chunk files into, should be roughly 4x the number of tokens",
				Value: 512 * 4, // default budget of ~512 tokens at roughly 4 bytes per token
			},
		},
	}
}
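
With these flags, invoking the subcommand would look roughly like the following; the binary name ai-code-assistant is an assumption based on the module path:

	$ ai-code-assistant indexer --repo /path/to/repo --chunk-size 2048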

type indexer struct {
	db        *database.Database
	llm       *llm.LLM
	repoPath  string
	repoID    string
	chunkSize int
}

func (idx *indexer) run(ctx context.Context, cmd *cli.Command) error {
	// Pull dependencies off the context, where they were attached upstream.
	idx.db = database.FromContext(ctx)
	idx.repoPath = cmd.String("repo")
	idx.chunkSize = int(cmd.Int("chunk-size"))
	idx.llm = llm.FromContext(ctx)

	if err := idx.upsertRepo(ctx); err != nil {
		return err
	}

	if err := idx.generateFileChunks(ctx); err != nil {
		return err
	}

	return nil
}
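
database.FromContext and llm.FromContext live in pkg/database and pkg/llm and are not part of this commit. For illustration only, a typical shape for such an accessor is the standard context-value pattern; the real implementations may differ:

	// Illustrative only: the common context-value pattern that accessors like
	// database.FromContext typically follow.
	package database

	import "context"

	type Database struct{ /* connection pool, prepared statements, ... */ }

	type ctxKey struct{}

	// WithDatabase returns a child context carrying db.
	func WithDatabase(ctx context.Context, db *Database) context.Context {
		return context.WithValue(ctx, ctxKey{}, db)
	}

	// FromContext returns the *Database previously stored with WithDatabase,
	// or nil if none is present.
	func FromContext(ctx context.Context) *Database {
		db, _ := ctx.Value(ctxKey{}).(*Database)
		return db
	}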

// upsertRepo records the repository's path and current HEAD commit under a
// freshly generated UUID, and keeps that ID for use in chunk metadata.
func (idx *indexer) upsertRepo(ctx context.Context) error {
	gitPath := osfs.New(filepath.Join(idx.repoPath, ".git"))

	gitRepo, err := git.Open(filesystem.NewStorage(gitPath, cache.NewObjectLRUDefault()), gitPath)
	if err != nil {
		return err
	}

	headRef, err := gitRepo.Head()
	if err != nil {
		return err
	}

	conn, err := idx.db.DB(ctx)
	if err != nil {
		return err
	}
	defer conn.Release()

	id := uuid.NewString()

	if _, err := conn.Exec(ctx, "insert_repo", id, headRef.Hash().String(), idx.repoPath); err != nil {
		return err
	}

	idx.repoID = id

	return nil
}
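
conn.Exec is called with the name of a prepared statement, insert_repo, whose SQL is registered elsewhere and is not shown in this commit. For orientation only, a hypothetical shape consistent with the three arguments passed above:

	// Purely hypothetical: the real "insert_repo" SQL is defined elsewhere.
	// These column names are guesses that merely line up with the three Exec
	// arguments (id, HEAD hash, repo path).
	const insertRepoSQL = `INSERT INTO repos (id, head_commit, path) VALUES ($1, $2, $3)`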

// crawlFiles walks the directory tree rooted at path depth-first, invoking cb
// for every regular file it finds. Nothing is filtered out here, so hidden
// directories such as .git are traversed as well.
func crawlFiles(ctx context.Context, path string, cb func(ctx context.Context, filePath string) error) error {
	pathFiles, err := os.ReadDir(path)
	if err != nil {
		return err
	}

	for _, file := range pathFiles {
		filePath := filepath.Join(path, file.Name())

		if file.IsDir() {
			if err := crawlFiles(ctx, filePath, cb); err != nil {
				return err
			}
		} else {
			if err := cb(ctx, filePath); err != nil {
				return err
			}
		}
	}

	return nil
}
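
The standard library provides the same depth-first traversal out of the box; a sketch of an equivalent built on filepath.WalkDir (the helper name crawlFilesWalk is made up for illustration):

	// Illustrative alternative to crawlFiles using the standard library.
	package indexer

	import (
		"context"
		"io/fs"
		"path/filepath"
	)

	func crawlFilesWalk(ctx context.Context, root string, cb func(ctx context.Context, filePath string) error) error {
		return filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error {
			if err != nil {
				return err
			}
			if d.IsDir() {
				return nil // WalkDir descends into directories on its own
			}
			return cb(ctx, path)
		})
	}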

func (idx *indexer) generateFileChunks(ctx context.Context) error {
	conn, err := idx.db.DB(ctx)
	if err != nil {
		return err
	}
	defer conn.Release()

	vectorStore, err := pgvector.New(ctx,
		pgvector.WithConn(conn),
		pgvector.WithEmbedder(idx.llm.Embedder()),
		pgvector.WithCollectionName("file_chunks"),
	)
	if err != nil {
		return err
	}

	allowedExtensions := []string{".go"}

	return crawlFiles(ctx, idx.repoPath, func(ctx context.Context, filePath string) error {
		chunkID := 0

		return chunkFile(ctx, filePath, idx.chunkSize, func(chunk []byte, start, end uint64) error {
			// The extension filter runs inside the chunk callback, so files
			// with other extensions are still read and chunked, just never
			// embedded or stored.
			shouldIndex := false
			for _, ext := range allowedExtensions {
				if filepath.Ext(filePath) == ext {
					shouldIndex = true

					break
				}
			}

			if !shouldIndex {
				return nil
			}

			slog.Info("indexing file", "chunk_id", chunkID, "chunk_size", len(chunk), "file_name", filePath)

			docs := []schema.Document{{
				PageContent: string(chunk),
				Metadata: map[string]any{
					"type":      "file_chunk",
					"file_path": filePath,
					"chunk_id":  strconv.FormatInt(int64(chunkID), 10),
					"start":     strconv.FormatUint(start, 10),
					"end":       strconv.FormatUint(end, 10),
					"repo_id":   idx.repoID,
				},
			}}

			if _, err := vectorStore.AddDocuments(ctx, docs); err != nil {
				return err
			}

			chunkID++
			return nil
		})
	})
}
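
Once chunks land in the file_chunks collection, retrieval on the agent side would presumably go through the same store. A minimal sketch using langchaingo's SimilaritySearch, assuming the same imports as this file; the helper name, query text, and result count are placeholders:

	// Illustrative retrieval over the indexed chunks. Assumes a pgvector.Store
	// constructed exactly as in generateFileChunks above.
	func searchChunks(ctx context.Context, vectorStore pgvector.Store) error {
		docs, err := vectorStore.SimilaritySearch(ctx, "where is the git repository opened?", 5)
		if err != nil {
			return err
		}

		for _, doc := range docs {
			slog.Info("matched chunk",
				"file_path", doc.Metadata["file_path"],
				"chunk_id", doc.Metadata["chunk_id"])
		}

		return nil
	}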

// chunkFile will take a file and return it in chunks of a size suitable for
// embedding. This is a very simple algorithm right now; it would be better to
// use a lexer to identify good parts of the AST to split on. We could also
// implement a reference graph to find the most relevant files based on the
// relationships between files.
func chunkFile(_ context.Context, filePath string, maxBytes int, chunkCb func(chunk []byte, start, end uint64) error) error {
	fileBytes, err := os.ReadFile(filePath)
	if err != nil {
		return err
	}

	pos := 0
	for pos < len(fileBytes) {
		nextChunkSize := maxBytes
		if pos+maxBytes > len(fileBytes) {
			nextChunkSize = len(fileBytes) - pos
		}

		if err := chunkCb(fileBytes[pos:pos+nextChunkSize], uint64(pos), uint64(pos+nextChunkSize)); err != nil {
			return err
		}

		// Advancing by maxBytes (not nextChunkSize) is fine: after the final
		// partial chunk, pos overshoots len(fileBytes) and the loop exits.
		pos += maxBytes
	}

	return nil
}
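
The doc comment above suggests AST-aware splitting; a minimal sketch of that idea for Go sources, chunking on top-level declarations via go/parser. The name chunkGoFile is made up, doc comments attached to declarations are ignored for simplicity, and declarations larger than the embedding budget would still need a fallback split:

	// Illustrative AST-aware chunker for Go files: one chunk per top-level
	// declaration instead of fixed byte windows.
	package indexer

	import (
		"context"
		"go/parser"
		"go/token"
		"os"
	)

	func chunkGoFile(_ context.Context, filePath string, chunkCb func(chunk []byte, start, end uint64) error) error {
		src, err := os.ReadFile(filePath)
		if err != nil {
			return err
		}

		fset := token.NewFileSet()
		f, err := parser.ParseFile(fset, filePath, src, parser.ParseComments)
		if err != nil {
			return err
		}

		// Emit one chunk per top-level declaration (funcs, types, vars, consts).
		for _, decl := range f.Decls {
			start := fset.Position(decl.Pos()).Offset
			end := fset.Position(decl.End()).Offset
			if err := chunkCb(src[start:end], uint64(start), uint64(end)); err != nil {
				return err
			}
		}

		return nil
	}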