Start of Readme, More Cleanup
@@ -5,13 +5,14 @@ import (
 	"ai-code-assistant/pkg/llm"
 	"context"
 	"github.com/tmc/langchaingo/schema"
 	"github.com/tmc/langchaingo/vectorstores/pgvector"
 	"log/slog"
 	"os"
 	"path/filepath"
 	"strconv"
 )
 
 // Indexer is responsible for crawling the head of a Git repository and generating embeddings so that the most relevant
 // chunks of code can be identified based on a given prompt.
 type Indexer struct {
 	repoPath  string
 	chunkSize int
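The hunk hides the middle of the struct. Judging by how idx.force, idx.db, and idx.llm are used later in this diff, the assembled struct plausibly looks something like the sketch below; the field types marked "assumed" are placeholders inferred from usage, not taken from the repository.

// Sketch of the full Indexer struct. The fields between chunkSize and
// allowedExtensions are hidden by the hunk, so the ones marked "assumed"
// are inferred from usage elsewhere in this diff.
type Indexer struct {
	repoPath          string
	chunkSize         int
	force             bool     // assumed: read as idx.force in Index
	db                *DB      // assumed: provides UpsertRepo, ClearChunkIndex, GetVectorStore
	llm               *llm.LLM // assumed: exposes Embedder() for the vector store
	allowedExtensions []string
}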
@@ -21,6 +22,7 @@ type Indexer struct {
 	allowedExtensions []string
 }
 
 // New creates a new Indexer.
 func New(ctx context.Context, path string, chunkSize int, force bool) *Indexer {
 	return &Indexer{
 		repoPath: path,
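For context, constructing and running the indexer presumably looks like this. The package import path and the argument values (a 2048-byte chunk size, force disabled) are illustrative only.

package main

import (
	"context"
	"log/slog"

	"ai-code-assistant/pkg/indexer" // assumed import path for the code in this diff
)

func main() {
	ctx := context.Background()

	// Example values only: index the repo at /path/to/repo in 2048-byte
	// chunks, without forcing a re-index.
	idx := indexer.New(ctx, "/path/to/repo", 2048, false)
	if err := idx.Index(ctx); err != nil {
		slog.Error("indexing failed", "error", err)
	}
}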
@@ -34,6 +36,8 @@ func New(ctx context.Context, path string, chunkSize int, force bool) *Indexer {
 	}
 }
 
 // Index will crawl a repository looking for supported files to index and will then index them.
 // The files are indexed by the path to the repository and the Git commit hash of the current HEAD.
 func (idx *Indexer) Index(ctx context.Context) error {
 	repoID, didUpdate, err := idx.db.UpsertRepo(ctx, idx.repoPath)
 	if err != nil {
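The doc comment says files are indexed by repository path plus the current HEAD commit hash, so UpsertRepo presumably resolves that hash to decide whether the index is stale. One way to resolve HEAD, shown purely as an illustration (the repo may well use a Git library instead):

package main

import (
	"context"
	"fmt"
	"os/exec"
	"strings"
)

// headHash resolves the commit hash of a repository's current HEAD by
// shelling out to git. Illustration only: how UpsertRepo actually resolves
// HEAD is not shown in this diff.
func headHash(ctx context.Context, repoPath string) (string, error) {
	out, err := exec.CommandContext(ctx, "git", "-C", repoPath, "rev-parse", "HEAD").Output()
	if err != nil {
		return "", err
	}
	return strings.TrimSpace(string(out)), nil
}

func main() {
	hash, err := headHash(context.Background(), ".")
	if err != nil {
		fmt.Println("error:", err)
		return
	}
	fmt.Println(hash)
}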
@@ -46,7 +50,7 @@ func (idx *Indexer) Index(ctx context.Context) error {
 		return nil
 	} else if didUpdate && idx.force {
 		slog.Info("repo already indexed, but force flag was set, cleaning and re-indexing")
-		if err := idx.cleanIndexForRepo(ctx, repoID); err != nil {
+		if err := idx.db.ClearChunkIndex(ctx, repoID); err != nil {
 			return err
 		}
 	} else if !didUpdate {
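This swap moves the raw SQL out of the indexer and behind the db package. Given the cleanIndexForRepo body deleted in the next hunk, ClearChunkIndex is presumably little more than that code relocated; a sketch under that assumption (the actual method body is not part of this diff):

// Plausible shape of db.ClearChunkIndex, assuming it is the old
// cleanIndexForRepo body moved into the db package.
func (d *DB) ClearChunkIndex(ctx context.Context, repoID string) error {
	conn, err := d.DB(ctx)
	if err != nil {
		return err
	}
	defer conn.Release()

	// "clear_chunks_for_repo" is the prepared statement the old code executed.
	if _, err := conn.Exec(ctx, "clear_chunks_for_repo", repoID); err != nil {
		return err
	}

	return nil
}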
@@ -60,58 +64,14 @@ func (idx *Indexer) Index(ctx context.Context) error {
 	return nil
 }
 
-func (idx *Indexer) cleanIndexForRepo(ctx context.Context, repoID string) error {
-	conn, err := idx.db.DB(ctx)
-	if err != nil {
-		return err
-	}
-	defer conn.Release()
-
-	if _, err := conn.Exec(ctx, "clear_chunks_for_repo", repoID); err != nil {
-		return err
-	}
-
-	return nil
-}
-
-func crawlFiles(ctx context.Context, path string, cb func(ctx context.Context, filePath string) error) error {
-	pathFiles, err := os.ReadDir(path)
-	if err != nil {
-		return err
-	}
-
-	for _, file := range pathFiles {
-		filePath := filepath.Join(path, file.Name())
-
-		if file.IsDir() {
-			if err := crawlFiles(ctx, filePath, cb); err != nil {
-				return err
-			}
-		} else {
-			if err := cb(ctx, filePath); err != nil {
-				return err
-			}
-		}
-	}
-
-	return nil
-}
-
 // generateFileChunks crawls the repository looking for supported files and will store the embeddings for those files
 // in the database so we can look for relevant chunks later.
 func (idx *Indexer) generateFileChunks(ctx context.Context, repoID string) error {
-	conn, err := idx.db.DB(ctx)
-	if err != nil {
-		return err
-	}
-	defer conn.Release()
-
-	vectorStore, err := pgvector.New(ctx,
-		pgvector.WithConn(conn),
-		pgvector.WithEmbedder(idx.llm.Embedder()),
-		pgvector.WithCollectionName("file_chunks"),
-	)
+	vectorStore, closeFunc, err := idx.db.GetVectorStore(ctx, idx.llm.Embedder())
 	if err != nil {
 		return err
 	}
+	defer closeFunc()
 
 	return crawlFiles(ctx, idx.repoPath, func(ctx context.Context, filePath string) error {
 		chunkID := 0
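The pgvector setup that used to live inline has moved behind db.GetVectorStore, which also explains the new closeFunc: the connection release that used to be a defer here now has to travel with the store. A sketch of what that helper might look like, reusing the exact options the deleted code passed to pgvector.New; the embedder parameter type and return types are assumptions, since the real implementation is not shown in this diff.

// Sketch of db.GetVectorStore based on the inline code removed above.
func (d *DB) GetVectorStore(ctx context.Context, embedder embeddings.Embedder) (vectorstores.VectorStore, func(), error) {
	conn, err := d.DB(ctx)
	if err != nil {
		return nil, nil, err
	}

	store, err := pgvector.New(ctx,
		pgvector.WithConn(conn),
		pgvector.WithEmbedder(embedder),
		pgvector.WithCollectionName("file_chunks"),
	)
	if err != nil {
		conn.Release()
		return nil, nil, err
	}

	// The caller must invoke the returned closeFunc to release the pooled
	// connection, mirroring the old defer conn.Release().
	return store, conn.Release, nil
}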
@@ -154,10 +114,34 @@ func (idx *Indexer) generateFileChunks(ctx context.Context, repoID string) error
 	})
 }
 
+// crawlFiles recursively crawls the repository tree looking for files; when a file is located, the callback is called.
+func crawlFiles(ctx context.Context, path string, cb func(ctx context.Context, filePath string) error) error {
+	pathFiles, err := os.ReadDir(path)
+	if err != nil {
+		return err
+	}
+
+	for _, file := range pathFiles {
+		filePath := filepath.Join(path, file.Name())
+
+		if file.IsDir() {
+			if err := crawlFiles(ctx, filePath, cb); err != nil {
+				return err
+			}
+		} else {
+			if err := cb(ctx, filePath); err != nil {
+				return err
+			}
+		}
+	}
+
+	return nil
+}
+
 // chunkFile will take a file and return it in chunks of a suitable size to be embedded.
 // This is a very simple algorithm right now, it would be better to use a lexer to identify good parts of the AST to
-// split on. We could also implement a reference graph to find the most relevant files based on the relationships
-// between files.
+// split on (e.g. functions, or groups of functions). We could also implement a reference graph to find the most
+// relevant files based on the relationships between files.
 func chunkFile(_ context.Context, filePath string, maxBytes int, chunkCb func(chunk []byte, start, end uint64) error) error {
 	fileBytes, err := os.ReadFile(filePath)
 	if err != nil {
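The diff cuts off inside chunkFile, so its body isn't visible here. Going only by the signature and the comment's description of a "very simple algorithm", a fixed-size byte-window version might look like the sketch below; anything beyond that (overlap handling, boundary trimming) is guesswork.

// Minimal sketch matching chunkFile's signature: slice the file into
// consecutive windows of at most maxBytes bytes and report byte offsets.
// The actual body is truncated in this diff, so this is an assumption.
func chunkFile(_ context.Context, filePath string, maxBytes int, chunkCb func(chunk []byte, start, end uint64) error) error {
	fileBytes, err := os.ReadFile(filePath)
	if err != nil {
		return err
	}

	for start := 0; start < len(fileBytes); start += maxBytes {
		end := start + maxBytes
		if end > len(fileBytes) {
			end = len(fileBytes)
		}
		if err := chunkCb(fileBytes[start:end], uint64(start), uint64(end)); err != nil {
			return err
		}
	}

	return nil
}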