Code Cleanup and Quality of Life

Check that the repo is indexed before generating code.
Don't generate tests for changes to tests.
Remove unused code.
Fix bootstrapping issue with langchaingo tables.
2025-04-20 08:31:26 -04:00
parent 4b8b8132fd
commit 25f8cae8cb
9 changed files with 282 additions and 256 deletions
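
Both sides of this change go through a shared pkg/indexer package: the autopatch agent now indexes the repository before generating code, and the indexer command delegates its run action to the same package. Below is a minimal sketch of the interface implied by the call sites in this diff; only New and Index actually appear in the commit, so the struct shape and field names here are assumptions.

// Sketch inferred from the call sites in this commit; not the actual
// pkg/indexer source. New takes a repo path, a chunk size in bytes,
// and a force flag; Index performs the (re-)indexing.
package indexer

import "context"

type Indexer struct {
	repoPath  string
	chunkSize int
	force     bool
}

func New(_ context.Context, repoPath string, chunkSize int, force bool) *Indexer {
	return &Indexer{repoPath: repoPath, chunkSize: chunkSize, force: force}
}

func (idx *Indexer) Index(ctx context.Context) error {
	// Real implementation: upsert the repo row, then crawl, chunk,
	// and embed files (see the code moved out of the command below).
	return nil
}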

View File

@@ -3,6 +3,7 @@ package autopatch
 import (
 	"ai-code-assistant/pkg/config"
 	"ai-code-assistant/pkg/database"
+	"ai-code-assistant/pkg/indexer"
 	"ai-code-assistant/pkg/llm"
 	"bytes"
 	"context"
@@ -48,6 +49,13 @@ func (a *agent) run(ctx context.Context, cmd *cli.Command) error {
 	llmRef := llm.FromContext(ctx)
 	a.llm = llmRef
+	// Make sure we're indexed.
+	idx := indexer.New(ctx, cmd.String("repo"), config.FromContext(ctx).IndexChunkSize, false)
+	if err := idx.Index(ctx); err != nil {
+		return err
+	}
+	// Attempt to generate the commit.
 	err := a.generateGitCommit(ctx, cmd.String("repo"), cmd.String("task"))
 	if err != nil {
 		return err
@@ -57,20 +65,26 @@ func (a *agent) run(ctx context.Context, cmd *cli.Command) error {
 }
 
 func (a *agent) generateGitCommit(ctx context.Context, repoPath, prompt string) error {
+	var affectedFiles []string
 	fileName, newCode, err := a.generateCodePatch(ctx, repoPath, prompt)
 	if err != nil {
 		return err
 	}
-	testFile, err := a.generateUnitTest(ctx, prompt, fileName, newCode)
-	if err != nil {
-		return err
+	affectedFiles = append(affectedFiles, fileName)
+	// If we modified a test, we don't need to generate a test.
+	if !strings.HasSuffix(fileName, "_test.go") {
+		testFile, err := a.generateUnitTest(ctx, prompt, fileName, newCode)
+		if err != nil {
+			return err
+		}
+		affectedFiles = append(affectedFiles, testFile)
 	}
-	// fileName, testFile := "/home/mpowers/Projects/simple-go-server/main.go", "/home/mpowers/Projects/simple-go-server/main_test.go"
-	if err := a.commit(ctx, prompt, repoPath, fileName, testFile); err != nil {
+	if err := a.commit(ctx, prompt, repoPath, affectedFiles...); err != nil {
 		return err
 	}
@@ -126,7 +140,7 @@ func (a *agent) generateCodePatch(ctx context.Context, repoPath, prompt string)
 	db := database.FromContext(ctx)
 	cfg := config.FromContext(ctx)
-	repoID, err := db.RepoIDFromPath(ctx, repoPath)
+	repoID, _, err := db.UpsertRepo(ctx, repoPath)
 	if err != nil {
 		return "", "", err
 	}

View File

@@ -40,7 +40,7 @@ func (c *chunks) run(ctx context.Context, cmd *cli.Command) error {
 	db := database.FromContext(ctx)
 	llmRef := llm.FromContext(ctx)
-	repoID, err := db.RepoIDFromPath(ctx, cmd.String("repo"))
+	repoID, _, err := db.UpsertRepo(ctx, cmd.String("repo"))
 	if err != nil {
 		return err
 	}
@@ -51,10 +51,6 @@ func (c *chunks) run(ctx context.Context, cmd *cli.Command) error {
 		return err
 	}
-	if err := relDocs.RankChunks(ctx, cmd.String("query"), chunks); err != nil {
-		return err
-	}
 	for _, chunk := range chunks {
 		slog.Info("found relevant chunk", "name", chunk.Name, "start", chunk.Start, "end", chunk.End, "score", chunk.Score, "id", chunk.ChunkID)
 	}

View File

@@ -1,28 +1,16 @@
 package indexer
 
 import (
-	"ai-code-assistant/pkg/database"
-	"ai-code-assistant/pkg/llm"
+	"ai-code-assistant/pkg/indexer"
 	"context"
-	"github.com/go-git/go-billy/v5/osfs"
-	"github.com/go-git/go-git/v5"
-	"github.com/go-git/go-git/v5/plumbing/cache"
-	"github.com/go-git/go-git/v5/storage/filesystem"
-	"github.com/google/uuid"
-	"github.com/tmc/langchaingo/schema"
-	"github.com/tmc/langchaingo/vectorstores/pgvector"
 	"github.com/urfave/cli/v3"
-	"log/slog"
-	"os"
-	"path/filepath"
-	"strconv"
 )
 
 func Command() *cli.Command {
 	return &cli.Command{
 		Name: "indexer",
 		Usage: "this command will index a local git repository to build context for the llm",
-		Action: (&indexer{}).run,
+		Action: run,
 		Flags: []cli.Flag{
 			&cli.StringFlag{
 				Name: "repo",
@@ -34,170 +22,20 @@ func Command() *cli.Command {
Usage: "number of bytes to chunk files into, should be roughly 4x the number of tokens",
Value: 512 * 4,
},
&cli.BoolFlag{
Name: "force",
Usage: "force re-indexing of the repository",
},
},
}
}
type indexer struct {
db *database.Database
llm *llm.LLM
repoPath string
repoID string
chunkSize int
}
func run(ctx context.Context, cmd *cli.Command) error {
idx := indexer.New(ctx, cmd.String("repo"), int(cmd.Int("chunk-size")), cmd.Bool("force"))
func (idx *indexer) run(ctx context.Context, cmd *cli.Command) error {
idx.db = database.FromContext(ctx)
idx.repoPath = cmd.String("repo")
idx.chunkSize = int(cmd.Int("chunk-size"))
idx.llm = llm.FromContext(ctx)
if err := idx.upsertRepo(ctx); err != nil {
return err
}
if err := idx.generateFileChunks(ctx); err != nil {
if err := idx.Index(ctx); err != nil {
return err
}
return nil
}
-
-func (idx *indexer) upsertRepo(ctx context.Context) error {
-	gitPath := osfs.New(filepath.Join(idx.repoPath, ".git"))
-	gitRepo, err := git.Open(filesystem.NewStorage(gitPath, cache.NewObjectLRUDefault()), gitPath)
-	if err != nil {
-		return err
-	}
-	headRef, err := gitRepo.Head()
-	if err != nil {
-		return err
-	}
-	conn, err := idx.db.DB(ctx)
-	if err != nil {
-		return err
-	}
-	defer conn.Release()
-	id := uuid.NewString()
-	if _, err := conn.Exec(ctx, "insert_repo", id, headRef.Hash().String(), idx.repoPath); err != nil {
-		return err
-	}
-	idx.repoID = id
-	return nil
-}
-
-func crawlFiles(ctx context.Context, path string, cb func(ctx context.Context, filePath string) error) error {
-	pathFiles, err := os.ReadDir(path)
-	if err != nil {
-		return err
-	}
-	for _, file := range pathFiles {
-		filePath := filepath.Join(path, file.Name())
-		if file.IsDir() {
-			if err := crawlFiles(ctx, filePath, cb); err != nil {
-				return err
-			}
-		} else {
-			if err := cb(ctx, filePath); err != nil {
-				return err
-			}
-		}
-	}
-	return nil
-}
-
-func (idx *indexer) generateFileChunks(ctx context.Context) error {
-	conn, err := idx.db.DB(ctx)
-	if err != nil {
-		return err
-	}
-	defer conn.Release()
-	vectorStore, err := pgvector.New(ctx,
-		pgvector.WithConn(conn),
-		pgvector.WithEmbedder(idx.llm.Embedder()),
-		pgvector.WithCollectionName("file_chunks"),
-	)
-	if err != nil {
-		return err
-	}
-	allowedExtensions := []string{".go"}
-	return crawlFiles(ctx, idx.repoPath, func(ctx context.Context, filePath string) error {
-		chunkID := 0
-		return chunkFile(ctx, filePath, idx.chunkSize, func(chunk []byte, start, end uint64) error {
-			shouldIndex := false
-			for _, ext := range allowedExtensions {
-				if filepath.Ext(filePath) == ext {
-					shouldIndex = true
-					break
-				}
-			}
-			if !shouldIndex {
-				return nil
-			}
-			slog.Info("indexing file", "chunk_id", chunkID, "chunk_size", len(chunk), "file_name", filePath)
-			docs := []schema.Document{{
-				PageContent: string(chunk),
-				Metadata: map[string]any{
-					"type":      "file_chunk",
-					"file_path": filePath,
-					"chunk_id":  strconv.FormatInt(int64(chunkID), 10),
-					"start":     strconv.FormatUint(start, 10),
-					"end":       strconv.FormatUint(end, 10),
-					"repo_id":   idx.repoID,
-				},
-			}}
-			if _, err := vectorStore.AddDocuments(ctx, docs); err != nil {
-				return err
-			}
-			chunkID++
-			return nil
-		})
-	})
-}
-
-// chunkFile will take a file and return it in chunks that are a suitable size to be embedded.
-// This is a very simple algorithm right now; it would be better to use a lexer to identify good parts of
-// the AST to split on. We could also implement a reference graph to find the most relevant files based
-// on the relationships between files.
-func chunkFile(_ context.Context, filePath string, maxBytes int, chunkCb func(chunk []byte, start, end uint64) error) error {
-	fileBytes, err := os.ReadFile(filePath)
-	if err != nil {
-		return err
-	}
-	pos := 0
-	for pos < len(fileBytes) {
-		nextChunkSize := maxBytes
-		if pos+maxBytes > len(fileBytes) {
-			nextChunkSize = len(fileBytes) - pos
-		}
-		if err := chunkCb(fileBytes[pos:pos+nextChunkSize], uint64(pos), uint64(pos+nextChunkSize)); err != nil {
-			return err
-		}
-		pos += maxBytes
-	}
-	return nil
-}
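
The doc comment on the removed chunkFile suggests AST-aware splitting as a future improvement. A minimal sketch of that idea follows, using the standard library's go/parser to emit one chunk per top-level declaration; it is illustrative only, not part of the commit, and oversized declarations would still need the byte-window fallback above.

package main

import (
	"fmt"
	"go/parser"
	"go/token"
	"os"
)

// chunkByDecl parses a Go source file and emits one chunk per
// top-level declaration, so function and type boundaries never
// land mid-chunk.
func chunkByDecl(filePath string, cb func(chunk []byte, start, end uint64) error) error {
	src, err := os.ReadFile(filePath)
	if err != nil {
		return err
	}
	fset := token.NewFileSet()
	f, err := parser.ParseFile(fset, filePath, src, parser.ParseComments)
	if err != nil {
		return err
	}
	for _, decl := range f.Decls {
		// Byte offsets of the declaration within the file.
		start := fset.Position(decl.Pos()).Offset
		end := fset.Position(decl.End()).Offset
		if err := cb(src[start:end], uint64(start), uint64(end)); err != nil {
			return err
		}
	}
	return nil
}

func main() {
	_ = chunkByDecl("main.go", func(chunk []byte, start, end uint64) error {
		fmt.Printf("chunk %d-%d (%d bytes)\n", start, end, len(chunk))
		return nil
	})
}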

View File

@@ -73,6 +73,14 @@ func readConfig(ctx context.Context, cmd *cli.Command) (context.Context, error)
 		return nil, fmt.Errorf("problem parsing config: %w", err)
 	}
+	if cfg.IndexChunkSize == 0 {
+		cfg.IndexChunkSize = 512 * 4
+	}
+	if cfg.RelevantDocs == 0 {
+		cfg.RelevantDocs = 5
+	}
 	return config.WrapContext(ctx, cfg), nil
 }
@@ -84,7 +92,10 @@ func initLogging(ctx context.Context, _ *cli.Command) (context.Context, error) {
 		return nil, err
 	}
-	slog.SetLogLoggerLevel(lvl)
+	logger := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
+		Level: lvl,
+	}))
+	slog.SetDefault(logger)
 	return ctx, nil
 }
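
Replacing the default text handler with a JSON handler means each record is emitted as one JSON object per line, which is friendlier to log aggregation. A self-contained sketch of the same setup (Go 1.21+ for log/slog):

package main

import (
	"log/slog"
	"os"
)

func main() {
	// Install a JSON handler at the desired level. This supersedes
	// slog.SetLogLoggerLevel, which only adjusts the level of the
	// built-in default handler.
	logger := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
		Level: slog.LevelInfo,
	}))
	slog.SetDefault(logger)

	// Prints something like:
	// {"time":"2025-04-20T08:31:26-04:00","level":"INFO","msg":"indexing file","file_name":"main.go"}
	slog.Info("indexing file", "file_name", "main.go")
}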