initial commit
This commit is contained in:
81
internal/config/config.go
Normal file
81
internal/config/config.go
Normal file
@@ -0,0 +1,81 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
)
|
||||
|
||||
// Config represents the application configuration
|
||||
type Config struct {
|
||||
DataSource DataSourceConfig `json:"data_source"`
|
||||
LLM LLMConfig `json:"llm"`
|
||||
OutputDir string `json:"output_dir"`
|
||||
BatchSize int `json:"batch_size"`
|
||||
APIKey string `json:"api_key"`
|
||||
}
|
||||
|
||||
// DataSourceConfig holds the settings of the input data source.
type DataSourceConfig struct {
	// URL is read from "url"; Config.Validate rejects an empty value.
	URL string `json:"url"`
}
|
||||
|
||||
// LLMConfig holds the settings of the language-model backend.
type LLMConfig struct {
	// Provider must be "openrouter" or "lmstudio" (enforced by Config.Validate).
	Provider string `json:"provider"`
	// Model is the model identifier; it is required for both providers.
	Model string `json:"model"`
	// APIKey is required when Provider is "openrouter".
	APIKey string `json:"api_key,omitempty"`
	// BaseURL is required when Provider is "lmstudio".
	BaseURL string `json:"base_url,omitempty"`
}
|
||||
|
||||
// Load loads configuration from a JSON file
|
||||
func Load(path string) (*Config, error) {
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("reading config file: %w", err)
|
||||
}
|
||||
|
||||
var cfg Config
|
||||
if err := json.Unmarshal(data, &cfg); err != nil {
|
||||
return nil, fmt.Errorf("parsing config file: %w", err)
|
||||
}
|
||||
|
||||
return &cfg, nil
|
||||
}
|
||||
|
||||
// Validate validates the configuration
|
||||
func (c *Config) Validate() error {
|
||||
if c.DataSource.URL == "" {
|
||||
return fmt.Errorf("data_source.url is required")
|
||||
}
|
||||
|
||||
if c.LLM.Provider != "openrouter" && c.LLM.Provider != "lmstudio" {
|
||||
return fmt.Errorf("llm.provider must be 'openrouter' or 'lmstudio'")
|
||||
}
|
||||
|
||||
if c.LLM.Model == "" {
|
||||
return fmt.Errorf("llm.model is required")
|
||||
}
|
||||
|
||||
if c.LLM.Provider == "openrouter" && c.LLM.APIKey == "" {
|
||||
return fmt.Errorf("llm.api_key is required for openrouter provider")
|
||||
}
|
||||
|
||||
if c.LLM.Provider == "lmstudio" && c.LLM.BaseURL == "" {
|
||||
return fmt.Errorf("llm.base_url is required for lmstudio provider")
|
||||
}
|
||||
|
||||
if c.OutputDir == "" {
|
||||
return fmt.Errorf("output_dir is required")
|
||||
}
|
||||
|
||||
if c.BatchSize <= 0 {
|
||||
return fmt.Errorf("batch_size must be greater than 0")
|
||||
}
|
||||
|
||||
if c.APIKey == "" {
|
||||
return fmt.Errorf("api_key is required")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
19
internal/domain/llm.go
Normal file
19
internal/domain/llm.go
Normal file
@@ -0,0 +1,19 @@
|
||||
package domain
|
||||
|
||||
import "context"
|
||||
|
||||
// LLMRequest is the provider-independent input of LLMProvider.Complete.
type LLMRequest struct {
	// Prompt is the text sent to the model.
	Prompt string
	// Schema optionally carries a JSON schema for structured output;
	// nil means no schema is requested.
	Schema interface{}
}
|
||||
|
||||
// LLMResponse is the provider-independent result of LLMProvider.Complete.
type LLMResponse struct {
	// Content is the text returned by the model.
	Content string
}
|
||||
|
||||
// LLMProvider defines the interface for LLM providers
|
||||
type LLMProvider interface {
|
||||
Complete(ctx context.Context, request LLMRequest) (*LLMResponse, error)
|
||||
}
|
||||
38
internal/domain/person.go
Normal file
38
internal/domain/person.go
Normal file
@@ -0,0 +1,38 @@
|
||||
package domain
|
||||
|
||||
// Person is a single record of the input data set together with the tags
// assigned to it.
type Person struct {
	Name    string `json:"name"`
	Surname string `json:"surname"`
	Gender  string `json:"gender"`
	// Born is the year of birth.
	Born int    `json:"born"`
	City string `json:"city"`
	// Job is the free-text job description. The "-" tag keeps it out of
	// JSON in both directions; it is only used as input for categorization.
	Job string `json:"-"`
	// Tags holds the categories assigned to the person.
	Tags []string `json:"tags"`
}
|
||||
|
||||
// Tag is the name of a category that can be assigned to a Person.
type Tag string

// The complete set of categories known to the application. The string values
// are the exact labels exchanged with the model.
const (
	TagIT              Tag = "IT"
	TagTransport       Tag = "transport"
	TagEdukacja        Tag = "edukacja"
	TagMedycyna        Tag = "medycyna"
	TagPracaZLudźmi    Tag = "praca z ludźmi"
	TagPracaZPojazdami Tag = "praca z pojazdami"
	TagPracaFizyczna   Tag = "praca fizyczna"
)
|
||||
|
||||
// AvailableTags returns all available tags
|
||||
func AvailableTags() []Tag {
|
||||
return []Tag{
|
||||
TagIT,
|
||||
TagTransport,
|
||||
TagEdukacja,
|
||||
TagMedycyna,
|
||||
TagPracaZLudźmi,
|
||||
TagPracaZPojazdami,
|
||||
TagPracaFizyczna,
|
||||
}
|
||||
}
|
||||
8
internal/domain/repository.go
Normal file
8
internal/domain/repository.go
Normal file
@@ -0,0 +1,8 @@
|
||||
package domain
|
||||
|
||||
import "context"
|
||||
|
||||
// PersonRepository defines the interface for fetching person data
|
||||
type PersonRepository interface {
|
||||
FetchPeople(ctx context.Context, url string) ([]Person, error)
|
||||
}
|
||||
116
internal/infrastructure/csv/repository.go
Normal file
116
internal/infrastructure/csv/repository.go
Normal file
@@ -0,0 +1,116 @@
|
||||
package csv
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/csv"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/paramah/ai_devs4/s01e01/internal/domain"
|
||||
)
|
||||
|
||||
// Repository is the CSV-over-HTTP implementation of domain.PersonRepository.
type Repository struct {
	// client performs the HTTP request that downloads the CSV document.
	client *http.Client
}
|
||||
|
||||
// NewRepository creates a new CSV repository
|
||||
func NewRepository() *Repository {
|
||||
return &Repository{
|
||||
client: &http.Client{},
|
||||
}
|
||||
}
|
||||
|
||||
// FetchPeople fetches people from a CSV file at the given URL
|
||||
func (r *Repository) FetchPeople(ctx context.Context, url string) ([]domain.Person, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("creating request: %w", err)
|
||||
}
|
||||
|
||||
resp, err := r.client.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("fetching CSV: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
reader := csv.NewReader(resp.Body)
|
||||
|
||||
// Read header
|
||||
header, err := reader.Read()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("reading header: %w", err)
|
||||
}
|
||||
|
||||
// Parse header to get column indices
|
||||
indices := make(map[string]int)
|
||||
for i, col := range header {
|
||||
indices[strings.TrimSpace(col)] = i
|
||||
}
|
||||
|
||||
var people []domain.Person
|
||||
for {
|
||||
record, err := reader.Read()
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("reading record: %w", err)
|
||||
}
|
||||
|
||||
person, err := r.parsePerson(record, indices)
|
||||
if err != nil {
|
||||
// Skip invalid records
|
||||
continue
|
||||
}
|
||||
|
||||
people = append(people, person)
|
||||
}
|
||||
|
||||
return people, nil
|
||||
}
|
||||
|
||||
func (r *Repository) parsePerson(record []string, indices map[string]int) (domain.Person, error) {
|
||||
var person domain.Person
|
||||
|
||||
if idx, ok := indices["name"]; ok && idx < len(record) {
|
||||
person.Name = strings.TrimSpace(record[idx])
|
||||
}
|
||||
|
||||
if idx, ok := indices["surname"]; ok && idx < len(record) {
|
||||
person.Surname = strings.TrimSpace(record[idx])
|
||||
}
|
||||
|
||||
if idx, ok := indices["gender"]; ok && idx < len(record) {
|
||||
person.Gender = strings.TrimSpace(record[idx])
|
||||
}
|
||||
|
||||
// Parse birthDate (format: YYYY-MM-DD) to extract year
|
||||
if idx, ok := indices["birthDate"]; ok && idx < len(record) {
|
||||
birthDate := strings.TrimSpace(record[idx])
|
||||
if len(birthDate) >= 4 {
|
||||
if year, err := strconv.Atoi(birthDate[:4]); err == nil {
|
||||
person.Born = year
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Use birthPlace as city
|
||||
if idx, ok := indices["birthPlace"]; ok && idx < len(record) {
|
||||
person.City = strings.TrimSpace(record[idx])
|
||||
}
|
||||
|
||||
// Read job description
|
||||
if idx, ok := indices["job"]; ok && idx < len(record) {
|
||||
person.Job = strings.TrimSpace(record[idx])
|
||||
}
|
||||
|
||||
return person, nil
|
||||
}
|
||||
130
internal/infrastructure/llm/lmstudio.go
Normal file
130
internal/infrastructure/llm/lmstudio.go
Normal file
@@ -0,0 +1,130 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
|
||||
"github.com/paramah/ai_devs4/s01e01/internal/domain"
|
||||
)
|
||||
|
||||
// LMStudioProvider is the domain.LLMProvider implementation that talks to an
// LM Studio server.
type LMStudioProvider struct {
	// baseURL is the server root; Complete appends "/v1/chat/completions".
	baseURL string
	// model is the model name sent with every request.
	model string
	// client performs the HTTP calls.
	client *http.Client
}
|
||||
|
||||
// NewLMStudioProvider creates a new LM Studio provider
|
||||
func NewLMStudioProvider(baseURL, model string) *LMStudioProvider {
|
||||
return &LMStudioProvider{
|
||||
baseURL: baseURL,
|
||||
model: model,
|
||||
client: &http.Client{},
|
||||
}
|
||||
}
|
||||
|
||||
type lmStudioRequest struct {
|
||||
Model string `json:"model"`
|
||||
Messages []map[string]interface{} `json:"messages"`
|
||||
ResponseFormat *lmResponseFormat `json:"response_format,omitempty"`
|
||||
Temperature float64 `json:"temperature,omitempty"`
|
||||
}
|
||||
|
||||
// lmResponseFormat is the "response_format" object of an LM Studio request.
type lmResponseFormat struct {
	Type string `json:"type"`
	// JSONSchema is omitted from the JSON when nil.
	JSONSchema interface{} `json:"json_schema,omitempty"`
}
|
||||
|
||||
// lmStudioResponse is the subset of the LM Studio chat-completions response
// that the provider reads.
type lmStudioResponse struct {
	// Choices lists the generated completions.
	Choices []struct {
		Message struct {
			Content string `json:"content"`
		} `json:"message"`
	} `json:"choices"`
	// Error is kept undecoded because Complete accepts either a JSON string
	// or an object with a "message" field here.
	Error json.RawMessage `json:"error,omitempty"`
}
|
||||
|
||||
// Complete sends a request to LM Studio local server
|
||||
func (p *LMStudioProvider) Complete(ctx context.Context, request domain.LLMRequest) (*domain.LLMResponse, error) {
|
||||
reqBody := lmStudioRequest{
|
||||
Model: p.model,
|
||||
Messages: []map[string]interface{}{
|
||||
{
|
||||
"role": "user",
|
||||
"content": request.Prompt,
|
||||
},
|
||||
},
|
||||
Temperature: 0.7,
|
||||
}
|
||||
|
||||
if request.Schema != nil {
|
||||
reqBody.ResponseFormat = &lmResponseFormat{
|
||||
Type: "json_schema",
|
||||
JSONSchema: request.Schema,
|
||||
}
|
||||
}
|
||||
|
||||
jsonData, err := json.Marshal(reqBody)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("marshaling request: %w", err)
|
||||
}
|
||||
|
||||
url := p.baseURL + "/v1/chat/completions"
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewBuffer(jsonData))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("creating request: %w", err)
|
||||
}
|
||||
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
|
||||
resp, err := p.client.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("sending request: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("reading response: %w", err)
|
||||
}
|
||||
|
||||
// Check HTTP status
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
var apiResp lmStudioResponse
|
||||
if err := json.Unmarshal(body, &apiResp); err != nil {
|
||||
return nil, fmt.Errorf("unmarshaling response: %w\nResponse body: %s", err, string(body))
|
||||
}
|
||||
|
||||
// Check for error in response
|
||||
if len(apiResp.Error) > 0 {
|
||||
// Try to parse as string
|
||||
var errStr string
|
||||
if err := json.Unmarshal(apiResp.Error, &errStr); err == nil {
|
||||
return nil, fmt.Errorf("API error: %s", errStr)
|
||||
}
|
||||
// Try to parse as object with message field
|
||||
var errObj struct {
|
||||
Message string `json:"message"`
|
||||
}
|
||||
if err := json.Unmarshal(apiResp.Error, &errObj); err == nil {
|
||||
return nil, fmt.Errorf("API error: %s", errObj.Message)
|
||||
}
|
||||
// Fallback to raw error
|
||||
return nil, fmt.Errorf("API error: %s", string(apiResp.Error))
|
||||
}
|
||||
|
||||
if len(apiResp.Choices) == 0 {
|
||||
return nil, fmt.Errorf("no choices in response. Response body: %s", string(body))
|
||||
}
|
||||
|
||||
return &domain.LLMResponse{
|
||||
Content: apiResp.Choices[0].Message.Content,
|
||||
}, nil
|
||||
}
|
||||
113
internal/infrastructure/llm/openrouter.go
Normal file
113
internal/infrastructure/llm/openrouter.go
Normal file
@@ -0,0 +1,113 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
|
||||
"github.com/paramah/ai_devs4/s01e01/internal/domain"
|
||||
)
|
||||
|
||||
// OpenRouterProvider is the domain.LLMProvider implementation that talks to
// the OpenRouter API.
type OpenRouterProvider struct {
	// apiKey is sent as a Bearer token with every request.
	apiKey string
	// model is the model name sent with every request.
	model string
	// baseURL is the complete chat-completions endpoint URL.
	baseURL string
	// client performs the HTTP calls.
	client *http.Client
}
|
||||
|
||||
// NewOpenRouterProvider creates a new OpenRouter provider
|
||||
func NewOpenRouterProvider(apiKey, model string) *OpenRouterProvider {
|
||||
return &OpenRouterProvider{
|
||||
apiKey: apiKey,
|
||||
model: model,
|
||||
baseURL: "https://openrouter.ai/api/v1/chat/completions",
|
||||
client: &http.Client{},
|
||||
}
|
||||
}
|
||||
|
||||
type openRouterRequest struct {
|
||||
Model string `json:"model"`
|
||||
Messages []map[string]interface{} `json:"messages"`
|
||||
ResponseFormat *responseFormat `json:"response_format,omitempty"`
|
||||
}
|
||||
|
||||
// responseFormat is the "response_format" object of an OpenRouter request.
type responseFormat struct {
	Type string `json:"type"`
	// JSONSchema is omitted from the JSON when nil.
	JSONSchema interface{} `json:"json_schema,omitempty"`
}
|
||||
|
||||
// openRouterResponse is the subset of the OpenRouter chat-completions
// response that the provider reads.
type openRouterResponse struct {
	// Choices lists the generated completions.
	Choices []struct {
		Message struct {
			Content string `json:"content"`
		} `json:"message"`
	} `json:"choices"`
	// Error is non-nil when the body contains an "error" object.
	Error *struct {
		Message string `json:"message"`
	} `json:"error,omitempty"`
}
|
||||
|
||||
// Complete sends a request to OpenRouter API
|
||||
func (p *OpenRouterProvider) Complete(ctx context.Context, request domain.LLMRequest) (*domain.LLMResponse, error) {
|
||||
reqBody := openRouterRequest{
|
||||
Model: p.model,
|
||||
Messages: []map[string]interface{}{
|
||||
{
|
||||
"role": "user",
|
||||
"content": request.Prompt,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
if request.Schema != nil {
|
||||
reqBody.ResponseFormat = &responseFormat{
|
||||
Type: "json_schema",
|
||||
JSONSchema: request.Schema,
|
||||
}
|
||||
}
|
||||
|
||||
jsonData, err := json.Marshal(reqBody)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("marshaling request: %w", err)
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodPost, p.baseURL, bytes.NewBuffer(jsonData))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("creating request: %w", err)
|
||||
}
|
||||
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req.Header.Set("Authorization", "Bearer "+p.apiKey)
|
||||
|
||||
resp, err := p.client.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("sending request: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("reading response: %w", err)
|
||||
}
|
||||
|
||||
var apiResp openRouterResponse
|
||||
if err := json.Unmarshal(body, &apiResp); err != nil {
|
||||
return nil, fmt.Errorf("unmarshaling response: %w", err)
|
||||
}
|
||||
|
||||
if apiResp.Error != nil {
|
||||
return nil, fmt.Errorf("API error: %s", apiResp.Error.Message)
|
||||
}
|
||||
|
||||
if len(apiResp.Choices) == 0 {
|
||||
return nil, fmt.Errorf("no choices in response")
|
||||
}
|
||||
|
||||
return &domain.LLMResponse{
|
||||
Content: apiResp.Choices[0].Message.Content,
|
||||
}, nil
|
||||
}
|
||||
467
internal/usecase/categorize.go
Normal file
467
internal/usecase/categorize.go
Normal file
@@ -0,0 +1,467 @@
|
||||
package usecase
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/paramah/ai_devs4/s01e01/internal/domain"
|
||||
)
|
||||
|
||||
// CategorizeUseCase handles the categorization of people
|
||||
type CategorizeUseCase struct {
|
||||
personRepo domain.PersonRepository
|
||||
llmProvider domain.LLMProvider
|
||||
outputDir string
|
||||
batchSize int
|
||||
}
|
||||
|
||||
// NewCategorizeUseCase creates a new categorize use case
|
||||
func NewCategorizeUseCase(repo domain.PersonRepository, llm domain.LLMProvider, outputDir string, batchSize int) *CategorizeUseCase {
|
||||
return &CategorizeUseCase{
|
||||
personRepo: repo,
|
||||
llmProvider: llm,
|
||||
outputDir: outputDir,
|
||||
batchSize: batchSize,
|
||||
}
|
||||
}
|
||||
|
||||
// Execute fetches people and categorizes them using LLM
|
||||
func (uc *CategorizeUseCase) Execute(ctx context.Context, dataURL string) ([]domain.Person, error) {
|
||||
// Create output directory if it doesn't exist
|
||||
if err := os.MkdirAll(uc.outputDir, 0755); err != nil {
|
||||
return nil, fmt.Errorf("creating output directory: %w", err)
|
||||
}
|
||||
|
||||
// Fetch people from data source
|
||||
allPeople, err := uc.personRepo.FetchPeople(ctx, dataURL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("fetching people: %w", err)
|
||||
}
|
||||
|
||||
originalCount := len(allPeople)
|
||||
fmt.Printf("\n[%s] ========== DATA FILTERING ==========\n", time.Now().Format("2006-01-02 15:04:05"))
|
||||
fmt.Printf("[%s] Original CSV entries: %d\n", time.Now().Format("2006-01-02 15:04:05"), originalCount)
|
||||
|
||||
if originalCount == 0 {
|
||||
return []domain.Person{}, nil
|
||||
}
|
||||
|
||||
fmt.Printf("[%s] Applying filters:\n", time.Now().Format("2006-01-02 15:04:05"))
|
||||
fmt.Printf(" - Gender: M (male)\n")
|
||||
fmt.Printf(" - Age in 2026: 20-40 years (born 1986-2006)\n")
|
||||
fmt.Printf(" - City: Grudziądz\n")
|
||||
fmt.Printf(" - Industry: ALL (will be categorized by LLM)\n\n")
|
||||
|
||||
// Filter people - keep only those matching criteria
|
||||
people := uc.filterCompletePeople(allPeople)
|
||||
filteredCount := len(people)
|
||||
|
||||
fmt.Printf("[%s] Filtered entries (matching criteria): %d\n", time.Now().Format("2006-01-02 15:04:05"), filteredCount)
|
||||
fmt.Printf("[%s] Removed entries: %d\n", time.Now().Format("2006-01-02 15:04:05"), originalCount-filteredCount)
|
||||
fmt.Printf("[%s] =====================================\n\n", time.Now().Format("2006-01-02 15:04:05"))
|
||||
|
||||
// Save filtered data to file
|
||||
if err := uc.saveFilteredData(people); err != nil {
|
||||
return nil, fmt.Errorf("saving filtered data: %w", err)
|
||||
}
|
||||
|
||||
if filteredCount == 0 {
|
||||
fmt.Printf("[%s] No complete entries to process\n", time.Now().Format("2006-01-02 15:04:05"))
|
||||
return []domain.Person{}, nil
|
||||
}
|
||||
|
||||
// Process in batches
|
||||
totalPeople := len(people)
|
||||
totalBatches := (totalPeople + uc.batchSize - 1) / uc.batchSize
|
||||
startTime := time.Now()
|
||||
processedBatches := 0
|
||||
skippedBatches := 0
|
||||
|
||||
for i := 0; i < totalPeople; i += uc.batchSize {
|
||||
batchNum := i/uc.batchSize + 1
|
||||
batchStart := time.Now()
|
||||
|
||||
end := i + uc.batchSize
|
||||
if end > totalPeople {
|
||||
end = totalPeople
|
||||
}
|
||||
|
||||
batch := people[i:end]
|
||||
|
||||
// Generate filename for this batch
|
||||
batchFilename := fmt.Sprintf("batch_%d_%d.json", i, end-1)
|
||||
batchFilepath := filepath.Join(uc.outputDir, batchFilename)
|
||||
|
||||
// Check if batch already processed
|
||||
if _, err := os.Stat(batchFilepath); err == nil {
|
||||
skippedBatches++
|
||||
fmt.Printf("[%s] Skipping batch %d/%d (entries %d-%d, already processed)\n",
|
||||
time.Now().Format("2006-01-02 15:04:05"),
|
||||
batchNum,
|
||||
totalBatches,
|
||||
i,
|
||||
end-1)
|
||||
continue
|
||||
}
|
||||
|
||||
// Calculate ETA
|
||||
var etaStr string
|
||||
if processedBatches > 0 {
|
||||
elapsed := time.Since(startTime)
|
||||
avgTimePerBatch := elapsed / time.Duration(processedBatches)
|
||||
remainingBatches := totalBatches - batchNum
|
||||
eta := avgTimePerBatch * time.Duration(remainingBatches)
|
||||
etaStr = fmt.Sprintf(" (ETA: %s)", eta.Round(time.Second))
|
||||
}
|
||||
|
||||
fmt.Printf("[%s] Processing batch %d/%d (entries %d-%d, %d people)...%s\n",
|
||||
time.Now().Format("2006-01-02 15:04:05"),
|
||||
batchNum,
|
||||
totalBatches,
|
||||
i,
|
||||
end-1,
|
||||
len(batch),
|
||||
etaStr)
|
||||
|
||||
// Prepare prompt for LLM
|
||||
prompt, schema := uc.buildPrompt(batch)
|
||||
|
||||
// Send to LLM for categorization
|
||||
response, err := uc.llmProvider.Complete(ctx, domain.LLMRequest{
|
||||
Prompt: prompt,
|
||||
Schema: schema,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("LLM completion (batch %d): %w", batchNum, err)
|
||||
}
|
||||
|
||||
// Log the raw model response for debugging
|
||||
fmt.Printf("[%s] Raw response:\n%s\n",
|
||||
time.Now().Format("2006-01-02 15:04:05"),
|
||||
response.Content)
|
||||
|
||||
// Parse the response
|
||||
categorizedBatch, err := uc.parseBatchResponse(response.Content)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("parsing LLM response (batch %d): %w\nRaw response: %s", batchNum, err, response.Content)
|
||||
}
|
||||
|
||||
// Save batch to file
|
||||
if err := uc.saveBatchToFile(categorizedBatch, batchFilepath); err != nil {
|
||||
return nil, fmt.Errorf("saving batch to file: %w", err)
|
||||
}
|
||||
|
||||
processedBatches++
|
||||
batchDuration := time.Since(batchStart)
|
||||
fmt.Printf("[%s] Batch %d completed in %s (%d people)\n",
|
||||
time.Now().Format("2006-01-02 15:04:05"),
|
||||
batchNum,
|
||||
batchDuration.Round(time.Second),
|
||||
len(categorizedBatch))
|
||||
}
|
||||
|
||||
fmt.Printf("\n[%s] Summary: Processed %d batches, Skipped %d (already done), Total %d batches\n",
|
||||
time.Now().Format("2006-01-02 15:04:05"),
|
||||
processedBatches,
|
||||
skippedBatches,
|
||||
totalBatches)
|
||||
|
||||
// Collect all results from files
|
||||
return uc.collectResults()
|
||||
}
|
||||
|
||||
// saveBatchToFile saves a batch of people to a JSON file
|
||||
func (uc *CategorizeUseCase) saveBatchToFile(people []domain.Person, filepath string) error {
|
||||
data, err := json.MarshalIndent(people, "", " ")
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshaling batch: %w", err)
|
||||
}
|
||||
|
||||
if err := os.WriteFile(filepath, data, 0644); err != nil {
|
||||
return fmt.Errorf("writing file: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// filterCompletePeople filters people based on specific criteria
|
||||
func (uc *CategorizeUseCase) filterCompletePeople(people []domain.Person) []domain.Person {
|
||||
const currentYear = 2026
|
||||
const minAge = 20
|
||||
const maxAge = 40
|
||||
|
||||
// Calculate birth year range
|
||||
minBornYear := currentYear - maxAge // 1986
|
||||
maxBornYear := currentYear - minAge // 2006
|
||||
|
||||
var filtered []domain.Person
|
||||
for _, person := range people {
|
||||
// Basic data completeness check
|
||||
if person.Gender == "" || person.Born == 0 || person.City == "" || person.Job == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
// Apply specific filters:
|
||||
// 1. Gender: Male (M)
|
||||
if person.Gender != "M" {
|
||||
continue
|
||||
}
|
||||
|
||||
// 2. Age: between 20-40 years in 2026 (born between 1986-2006)
|
||||
if person.Born < minBornYear || person.Born > maxBornYear {
|
||||
continue
|
||||
}
|
||||
|
||||
// 3. City: Grudziądz
|
||||
if person.City != "Grudziądz" {
|
||||
continue
|
||||
}
|
||||
|
||||
// 4. Industry: transport-related (DISABLED - LLM will categorize)
|
||||
// if !uc.isTransportJob(person.Job) {
|
||||
// continue
|
||||
// }
|
||||
|
||||
filtered = append(filtered, person)
|
||||
}
|
||||
return filtered
|
||||
}
|
||||
|
||||
// isTransportJob checks if job description is related to transport industry
|
||||
func (uc *CategorizeUseCase) isTransportJob(jobDescription string) bool {
|
||||
jobLower := strings.ToLower(jobDescription)
|
||||
|
||||
transportKeywords := []string{
|
||||
"transport",
|
||||
"pojazd",
|
||||
"samochód",
|
||||
"kierow",
|
||||
"prowadz", // prowadzenie pojazdu
|
||||
"dostaw",
|
||||
"przewóz",
|
||||
"logistyk",
|
||||
"auto",
|
||||
"ciężar", // ciężarówka
|
||||
"bus",
|
||||
"tir",
|
||||
"wagon",
|
||||
"pojazd",
|
||||
"ruch",
|
||||
"droga",
|
||||
"trasa",
|
||||
"przesył",
|
||||
}
|
||||
|
||||
for _, keyword := range transportKeywords {
|
||||
if strings.Contains(jobLower, keyword) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// saveFilteredData saves filtered people data to a JSON file
|
||||
func (uc *CategorizeUseCase) saveFilteredData(people []domain.Person) error {
|
||||
// Create a version without Job field for saving
|
||||
type PersonForSave struct {
|
||||
Name string `json:"name"`
|
||||
Surname string `json:"surname"`
|
||||
Gender string `json:"gender"`
|
||||
Born int `json:"born"`
|
||||
City string `json:"city"`
|
||||
Job string `json:"job"`
|
||||
}
|
||||
|
||||
peopleForSave := make([]PersonForSave, len(people))
|
||||
for i, p := range people {
|
||||
peopleForSave[i] = PersonForSave{
|
||||
Name: p.Name,
|
||||
Surname: p.Surname,
|
||||
Gender: p.Gender,
|
||||
Born: p.Born,
|
||||
City: p.City,
|
||||
Job: p.Job,
|
||||
}
|
||||
}
|
||||
|
||||
data, err := json.MarshalIndent(peopleForSave, "", " ")
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshaling filtered data: %w", err)
|
||||
}
|
||||
|
||||
filePath := filepath.Join(uc.outputDir, "filtered_people.json")
|
||||
if err := os.WriteFile(filePath, data, 0644); err != nil {
|
||||
return fmt.Errorf("writing filtered data file: %w", err)
|
||||
}
|
||||
|
||||
fmt.Printf("[%s] Filtered data saved to: %s\n\n", time.Now().Format("2006-01-02 15:04:05"), filePath)
|
||||
return nil
|
||||
}
|
||||
|
||||
// collectResults reads all batch files and returns them as a slice
|
||||
func (uc *CategorizeUseCase) collectResults() ([]domain.Person, error) {
|
||||
files, err := os.ReadDir(uc.outputDir)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("reading output directory: %w", err)
|
||||
}
|
||||
|
||||
var allPeople []domain.Person
|
||||
for _, file := range files {
|
||||
// Skip filtered_people.json and only process batch files
|
||||
if file.IsDir() || filepath.Ext(file.Name()) != ".json" || file.Name() == "filtered_people.json" {
|
||||
continue
|
||||
}
|
||||
|
||||
data, err := os.ReadFile(filepath.Join(uc.outputDir, file.Name()))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("reading file %s: %w", file.Name(), err)
|
||||
}
|
||||
|
||||
var batch []domain.Person
|
||||
if err := json.Unmarshal(data, &batch); err != nil {
|
||||
return nil, fmt.Errorf("unmarshaling file %s: %w", file.Name(), err)
|
||||
}
|
||||
|
||||
allPeople = append(allPeople, batch...)
|
||||
}
|
||||
|
||||
return allPeople, nil
|
||||
}
|
||||
|
||||
func (uc *CategorizeUseCase) buildPrompt(batch []domain.Person) (string, interface{}) {
|
||||
// Create a version with job descriptions for the prompt
|
||||
type PersonWithJob struct {
|
||||
Name string `json:"name"`
|
||||
Surname string `json:"surname"`
|
||||
Gender string `json:"gender"`
|
||||
Born int `json:"born"`
|
||||
City string `json:"city"`
|
||||
Job string `json:"job"`
|
||||
}
|
||||
|
||||
batchWithJobs := make([]PersonWithJob, len(batch))
|
||||
for i, p := range batch {
|
||||
batchWithJobs[i] = PersonWithJob{
|
||||
Name: p.Name,
|
||||
Surname: p.Surname,
|
||||
Gender: p.Gender,
|
||||
Born: p.Born,
|
||||
City: p.City,
|
||||
Job: p.Job,
|
||||
}
|
||||
}
|
||||
|
||||
batchJSON, _ := json.Marshal(batchWithJobs)
|
||||
|
||||
availableTags := domain.AvailableTags()
|
||||
tagsJSON, _ := json.Marshal(availableTags)
|
||||
|
||||
prompt := fmt.Sprintf(`Categorize the following people based on their job descriptions.
|
||||
Each person should be assigned one or more appropriate tags from the available list based on their job field.
|
||||
|
||||
Available tags: %s
|
||||
|
||||
People to categorize (each person has name, surname, gender, born year, city, and job description):
|
||||
%s
|
||||
|
||||
CRITICAL INSTRUCTIONS:
|
||||
1. YOU MUST PROCESS ALL %d PEOPLE IN THE INPUT - NOT JUST A FEW!
|
||||
2. Read EVERY person's job description carefully
|
||||
3. Assign 1-3 relevant tags from the available list based on what the job description says
|
||||
4. Tag mapping guidelines:
|
||||
- "IT" - for programming, software development, algorithms, data structures, technology
|
||||
- "transport" - for driving, vehicle operation, logistics
|
||||
- "edukacja" - for teaching, training, education, development of skills
|
||||
- "medycyna" - for healthcare, doctors, nurses, medical diagnosis, treatment
|
||||
- "praca z ludźmi" - for jobs involving direct work with people (teaching, healthcare, consulting, etc.)
|
||||
- "praca z pojazdami" - for mechanics, vehicle repair, automotive work
|
||||
- "praca fizyczna" - for manual labor, construction, carpentry, physical work
|
||||
5. Many jobs can have multiple tags (e.g., a teacher = "edukacja" + "praca z ludźmi")
|
||||
6. Return a complete JSON array with ALL %d people
|
||||
7. Each person object must have: name, surname, gender, born, city, tags
|
||||
8. Do NOT include the job description in the output
|
||||
9. No explanations, no markdown formatting, just the JSON array`, string(tagsJSON), string(batchJSON), len(batch), len(batch))
|
||||
|
||||
// JSON Schema for structured output
|
||||
schema := map[string]interface{}{
|
||||
"name": "categorize_people",
|
||||
"schema": map[string]interface{}{
|
||||
"type": "array",
|
||||
"items": map[string]interface{}{
|
||||
"type": "object",
|
||||
"properties": map[string]interface{}{
|
||||
"name": map[string]interface{}{
|
||||
"type": "string",
|
||||
},
|
||||
"surname": map[string]interface{}{
|
||||
"type": "string",
|
||||
},
|
||||
"gender": map[string]interface{}{
|
||||
"type": "string",
|
||||
},
|
||||
"born": map[string]interface{}{
|
||||
"type": "integer",
|
||||
},
|
||||
"city": map[string]interface{}{
|
||||
"type": "string",
|
||||
},
|
||||
"tags": map[string]interface{}{
|
||||
"type": "array",
|
||||
"items": map[string]interface{}{
|
||||
"type": "string",
|
||||
},
|
||||
},
|
||||
},
|
||||
"required": []string{"name", "surname", "gender", "born", "city", "tags"},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
return prompt, schema
|
||||
}
|
||||
|
||||
func (uc *CategorizeUseCase) parseBatchResponse(content string) ([]domain.Person, error) {
|
||||
// Clean up the response - extract JSON
|
||||
cleanContent := uc.cleanJSONResponse(content)
|
||||
|
||||
var people []domain.Person
|
||||
if err := json.Unmarshal([]byte(cleanContent), &people); err != nil {
|
||||
return nil, fmt.Errorf("unmarshaling response: %w", err)
|
||||
}
|
||||
|
||||
return people, nil
|
||||
}
|
||||
|
||||
// cleanJSONResponse extracts JSON from response that may contain extra text
|
||||
func (uc *CategorizeUseCase) cleanJSONResponse(content string) string {
|
||||
// Trim whitespace
|
||||
content = strings.TrimSpace(content)
|
||||
|
||||
// Try to find JSON array first [...]
|
||||
arrayStart := strings.Index(content, "[")
|
||||
arrayEnd := strings.LastIndex(content, "]")
|
||||
|
||||
// Try to find JSON object {...}
|
||||
objectStart := strings.Index(content, "{")
|
||||
objectEnd := strings.LastIndex(content, "}")
|
||||
|
||||
// Use whichever comes first
|
||||
if arrayStart != -1 && (objectStart == -1 || arrayStart < objectStart) {
|
||||
if arrayEnd != -1 && arrayEnd > arrayStart {
|
||||
return strings.TrimSpace(content[arrayStart : arrayEnd+1])
|
||||
}
|
||||
}
|
||||
|
||||
if objectStart != -1 && objectEnd != -1 && objectStart < objectEnd {
|
||||
return strings.TrimSpace(content[objectStart : objectEnd+1])
|
||||
}
|
||||
|
||||
// No valid JSON found, return as is
|
||||
return content
|
||||
}
|
||||
Reference in New Issue
Block a user