Code Cleanup and Quality of Life
Check that the repo is indexed before generating code. Don't generate tests for changes to tests. Remove unused code. Fix a bootstrapping issue with the langchaingo tables.
pkg/indexer/indexer.go (182 lines, normal file)
@@ -0,0 +1,182 @@
package indexer

import (
	"ai-code-assistant/pkg/database"
	"ai-code-assistant/pkg/llm"
	"context"
	"github.com/tmc/langchaingo/schema"
	"github.com/tmc/langchaingo/vectorstores/pgvector"
	"log/slog"
	"os"
	"path/filepath"
	"strconv"
)
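
// Indexer walks a repository on disk, splits each file into fixed-size
// chunks, and stores embeddings for those chunks in a pgvector-backed
// vector store.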
type Indexer struct {
	repoPath          string
	chunkSize         int
	force             bool
	db                *database.Database
	llm               *llm.LLM
	allowedExtensions []string
}
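
// New builds an Indexer for the repository at path; the database handle and
// LLM client are pulled from the context.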
func New(ctx context.Context, path string, chunkSize int, force bool) *Indexer {
	return &Indexer{
		repoPath:  path,
		chunkSize: chunkSize,
		force:     force,
		db:        database.FromContext(ctx),
		llm:       llm.FromContext(ctx),
		allowedExtensions: []string{
			".go",
		},
	}
}
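
// Index upserts the repository record and embeds its files. A repo that is
// already indexed is skipped unless force is set, in which case its existing
// chunks are cleared and the repo is re-indexed from scratch.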
func (idx *Indexer) Index(ctx context.Context) error {
	repoID, didUpdate, err := idx.db.UpsertRepo(ctx, idx.repoPath)
	if err != nil {
		return err
	}

	if didUpdate && !idx.force {
		slog.Info("repo already indexed, skipping")

		return nil
	} else if didUpdate && idx.force {
		slog.Info("repo already indexed, but force flag was set, cleaning and re-indexing")
		if err := idx.cleanIndexForRepo(ctx, repoID); err != nil {
			return err
		}
	} else if !didUpdate {
		slog.Info("indexing new repository", "path", idx.repoPath, "repo_id", repoID)
	}

	if err := idx.generateFileChunks(ctx, repoID); err != nil {
		return err
	}

	return nil
}
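
// cleanIndexForRepo removes all previously stored chunks for the given repo
// so a forced re-index starts from a clean slate.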
func (idx *Indexer) cleanIndexForRepo(ctx context.Context, repoID string) error {
	conn, err := idx.db.DB(ctx)
	if err != nil {
		return err
	}
	defer conn.Release()

	if _, err := conn.Exec(ctx, "clear_chunks_for_repo", repoID); err != nil {
		return err
	}

	return nil
}
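
// crawlFiles recursively walks the directory tree rooted at path, invoking
// cb for every file it encounters.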
func crawlFiles(ctx context.Context, path string, cb func(ctx context.Context, filePath string) error) error {
	pathFiles, err := os.ReadDir(path)
	if err != nil {
		return err
	}

	for _, file := range pathFiles {
		filePath := filepath.Join(path, file.Name())

		if file.IsDir() {
			if err := crawlFiles(ctx, filePath, cb); err != nil {
				return err
			}
		} else {
			if err := cb(ctx, filePath); err != nil {
				return err
			}
		}
	}

	return nil
}
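
// generateFileChunks crawls the repo, chunks every file with an allowed
// extension, and adds each chunk to the "file_chunks" pgvector collection
// together with its file path, byte offsets, and repo ID.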
func (idx *Indexer) generateFileChunks(ctx context.Context, repoID string) error {
	conn, err := idx.db.DB(ctx)
	if err != nil {
		return err
	}
	defer conn.Release()

	vectorStore, err := pgvector.New(ctx,
		pgvector.WithConn(conn),
		pgvector.WithEmbedder(idx.llm.Embedder()),
		pgvector.WithCollectionName("file_chunks"),
	)
	if err != nil {
		return err
	}

	return crawlFiles(ctx, idx.repoPath, func(ctx context.Context, filePath string) error {
		chunkID := 0

		return chunkFile(ctx, filePath, idx.chunkSize, func(chunk []byte, start, end uint64) error {
			shouldIndex := false
			for _, ext := range idx.allowedExtensions {
				if filepath.Ext(filePath) == ext {
					shouldIndex = true

					break
				}
			}

			if !shouldIndex {
				return nil
			}

			slog.Info("indexing file", "chunk_id", chunkID, "chunk_size", len(chunk), "file_name", filePath)

			docs := []schema.Document{{
				PageContent: string(chunk),
				Metadata: map[string]any{
					"type":      "file_chunk",
					"file_path": filePath,
					"chunk_id":  strconv.FormatInt(int64(chunkID), 10),
					"start":     strconv.FormatUint(start, 10),
					"end":       strconv.FormatUint(end, 10),
					"repo_id":   repoID,
				},
			}}

			if _, err := vectorStore.AddDocuments(ctx, docs); err != nil {
				return err
			}

			chunkID++
			return nil
		})
	})
}
// chunkFile will take a file and return it in chunks of a suitable size to be embedded.
// This is a very simple algorithm right now; it would be better to use a lexer to identify good parts of the AST to
// split on. We could also implement a reference graph to find the most relevant files based on the relationships
// between files.
func chunkFile(_ context.Context, filePath string, maxBytes int, chunkCb func(chunk []byte, start, end uint64) error) error {
	fileBytes, err := os.ReadFile(filePath)
	if err != nil {
		return err
	}

	pos := 0
	for pos < len(fileBytes) {
		nextChunkSize := maxBytes
		if pos+maxBytes > len(fileBytes) {
			nextChunkSize = len(fileBytes) - pos
		}

		if err := chunkCb(fileBytes[pos:pos+nextChunkSize], uint64(pos), uint64(pos+nextChunkSize)); err != nil {
			return err
		}

		pos += maxBytes
	}

	return nil
}
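
For context, a minimal sketch of how this indexer might be invoked. Only the FromContext accessors appear in this diff, so the caller below, including how the context gets its database handle and LLM client, is an assumption:

package main

import (
	"context"

	"ai-code-assistant/pkg/indexer"
)

// runIndex is a hypothetical caller; the context is assumed to already carry
// the database handle and LLM client that indexer.New retrieves via
// database.FromContext and llm.FromContext.
func runIndex(ctx context.Context, repoPath string) error {
	idx := indexer.New(ctx, repoPath, 1024 /* chunk size in bytes */, false /* force */)

	return idx.Index(ctx)
}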