// s01e01/internal/usecase/categorize.go

package usecase

import (
	"context"
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
	"strings"
	"time"

	"github.com/paramah/ai_devs4/s01e01/internal/domain"
)

// CategorizeUseCase handles the categorization of people.
//
// It fetches people through personRepo, asks llmProvider to tag them batch by
// batch, and writes its intermediate and final JSON files into outputDir.
type CategorizeUseCase struct {
	// personRepo supplies the raw people records (via FetchPeople).
	personRepo  domain.PersonRepository
	// llmProvider is the model backend that categorizes each batch (via Complete).
	llmProvider domain.LLMProvider
	// outputDir is the directory that receives filtered_people.json and the
	// per-batch batch_<first>_<last>.json files.
	outputDir   string
	// batchSize is the number of people sent to the LLM in a single request.
	batchSize   int
}

// NewCategorizeUseCase builds a CategorizeUseCase from its collaborators.
//
// repo supplies the people, llm performs the tagging, outputDir is where the
// result files are written and batchSize is the number of people per LLM
// request. The arguments are stored as given; nothing is validated here.
func NewCategorizeUseCase(repo domain.PersonRepository, llm domain.LLMProvider, outputDir string, batchSize int) *CategorizeUseCase {
	uc := new(CategorizeUseCase)
	uc.personRepo = repo
	uc.llmProvider = llm
	uc.outputDir = outputDir
	uc.batchSize = batchSize
	return uc
}

// Execute fetches people from dataURL, filters them and categorizes the
// remaining entries with the LLM, batch by batch.
//
// The pipeline is:
//  1. ensure outputDir exists;
//  2. fetch all people through the repository;
//  3. keep only the entries accepted by filterCompletePeople and store them in
//     filtered_people.json;
//  4. send the filtered people to the LLM in slices of batchSize, writing each
//     answer to batch_<first>_<last>.json inside outputDir;
//  5. return whatever collectResults reads back from outputDir.
//
// A batch whose file already exists is skipped, so a re-run continues after
// the batches that were already written. Progress is printed to stdout.
//
// It returns an empty, non-nil slice when nothing was fetched or nothing
// passed the filter. Any failure (I/O, fetch, LLM call, unparsable answer, a
// non-positive batchSize, or a cancelled ctx) is returned as an error together
// with a nil slice.
func (uc *CategorizeUseCase) Execute(ctx context.Context, dataURL string) ([]domain.Person, error) {
	// stamp renders the timestamp that prefixes every progress line.
	stamp := func() string { return time.Now().Format("2006-01-02 15:04:05") }

	// Create output directory if it doesn't exist
	if err := os.MkdirAll(uc.outputDir, 0755); err != nil {
		return nil, fmt.Errorf("creating output directory: %w", err)
	}

	// Fetch people from data source
	allPeople, err := uc.personRepo.FetchPeople(ctx, dataURL)
	if err != nil {
		return nil, fmt.Errorf("fetching people: %w", err)
	}

	originalCount := len(allPeople)
	fmt.Printf("\n[%s] ========== DATA FILTERING ==========\n", stamp())
	fmt.Printf("[%s] Original CSV entries: %d\n", stamp(), originalCount)

	if originalCount == 0 {
		return []domain.Person{}, nil
	}

	fmt.Printf("[%s] Applying filters:\n", stamp())
	fmt.Printf("  - Gender: M (male)\n")
	fmt.Printf("  - Age in 2026: 20-40 years (born 1986-2006)\n")
	fmt.Printf("  - City: Grudziądz\n")
	fmt.Printf("  - Industry: ALL (will be categorized by LLM)\n\n")

	// Filter people - keep only those matching criteria
	people := uc.filterCompletePeople(allPeople)
	filteredCount := len(people)

	fmt.Printf("[%s] Filtered entries (matching criteria): %d\n", stamp(), filteredCount)
	fmt.Printf("[%s] Removed entries: %d\n", stamp(), originalCount-filteredCount)
	fmt.Printf("[%s] =====================================\n\n", stamp())

	// Save filtered data to file
	if err := uc.saveFilteredData(people); err != nil {
		return nil, fmt.Errorf("saving filtered data: %w", err)
	}

	if filteredCount == 0 {
		fmt.Printf("[%s] No complete entries to process\n", stamp())
		return []domain.Person{}, nil
	}

	// A non-positive batch size would divide by zero below (or slice with a
	// negative bound), so reject it with an error instead of panicking.
	if uc.batchSize <= 0 {
		return nil, fmt.Errorf("invalid batch size %d: must be positive", uc.batchSize)
	}

	// Process in batches
	totalPeople := len(people)
	totalBatches := (totalPeople + uc.batchSize - 1) / uc.batchSize
	startTime := time.Now()
	processedBatches := 0
	skippedBatches := 0

	for i := 0; i < totalPeople; i += uc.batchSize {
		batchNum := i/uc.batchSize + 1
		batchStart := time.Now()

		end := i + uc.batchSize
		if end > totalPeople {
			end = totalPeople
		}

		batch := people[i:end]

		// The file name encodes the first and last entry index of the batch.
		batchFilename := fmt.Sprintf("batch_%d_%d.json", i, end-1)
		batchFilepath := filepath.Join(uc.outputDir, batchFilename)

		// An existing file means this batch was written by an earlier run.
		if _, err := os.Stat(batchFilepath); err == nil {
			skippedBatches++
			fmt.Printf("[%s] Skipping batch %d/%d (entries %d-%d, already processed)\n",
				stamp(),
				batchNum,
				totalBatches,
				i,
				end-1)
			continue
		}

		// Stop before spending another LLM call once the caller has given up.
		if err := ctx.Err(); err != nil {
			return nil, fmt.Errorf("categorization interrupted before batch %d: %w", batchNum, err)
		}

		// Calculate ETA from the average duration of the batches done so far.
		var etaStr string
		if processedBatches > 0 {
			avgTimePerBatch := time.Since(startTime) / time.Duration(processedBatches)
			// The current batch has not run yet, so it still counts as remaining.
			remainingBatches := totalBatches - batchNum + 1
			eta := avgTimePerBatch * time.Duration(remainingBatches)
			etaStr = fmt.Sprintf(" (ETA: %s)", eta.Round(time.Second))
		}

		fmt.Printf("[%s] Processing batch %d/%d (entries %d-%d, %d people)...%s\n",
			stamp(),
			batchNum,
			totalBatches,
			i,
			end-1,
			len(batch),
			etaStr)

		// Prepare prompt for LLM
		prompt, schema := uc.buildPrompt(batch)

		// Send to LLM for categorization
		response, err := uc.llmProvider.Complete(ctx, domain.LLMRequest{
			Prompt: prompt,
			Schema: schema,
		})
		if err != nil {
			return nil, fmt.Errorf("LLM completion (batch %d): %w", batchNum, err)
		}

		// Log the raw model response for debugging
		fmt.Printf("[%s] Raw response:\n%s\n", stamp(), response.Content)

		// Parse the response
		categorizedBatch, err := uc.parseBatchResponse(response.Content)
		if err != nil {
			return nil, fmt.Errorf("parsing LLM response (batch %d): %w\nRaw response: %s", batchNum, err, response.Content)
		}

		// The prompt asks for every input person back; make a short or long
		// answer visible, because the batch file will not be regenerated.
		if len(categorizedBatch) != len(batch) {
			fmt.Printf("[%s] WARNING: batch %d returned %d people, expected %d\n",
				stamp(),
				batchNum,
				len(categorizedBatch),
				len(batch))
		}

		// Save batch to file
		if err := uc.saveBatchToFile(categorizedBatch, batchFilepath); err != nil {
			return nil, fmt.Errorf("saving batch to file: %w", err)
		}

		processedBatches++
		batchDuration := time.Since(batchStart)
		fmt.Printf("[%s] Batch %d completed in %s (%d people)\n",
			stamp(),
			batchNum,
			batchDuration.Round(time.Second),
			len(categorizedBatch))
	}

	fmt.Printf("\n[%s] Summary: Processed %d batches, Skipped %d (already done), Total %d batches\n",
		stamp(),
		processedBatches,
		skippedBatches,
		totalBatches)

	// Collect all results from files
	return uc.collectResults()
}

// saveBatchToFile writes people as indented JSON to the file at dest.
//
// The data is first written to "<dest>.tmp" and then renamed into place.
// Execute treats the mere existence of a batch file as "already processed", so
// a half-written file left behind by an interrupted run must never appear
// under the final name. A leftover ".tmp" file is ignored by collectResults
// because it does not have the ".json" extension.
//
// It returns a wrapped error when marshaling, writing or renaming fails.
func (uc *CategorizeUseCase) saveBatchToFile(people []domain.Person, dest string) error {
	data, err := json.MarshalIndent(people, "", "  ")
	if err != nil {
		return fmt.Errorf("marshaling batch: %w", err)
	}

	tmp := dest + ".tmp"
	if err := os.WriteFile(tmp, data, 0644); err != nil {
		// Best-effort cleanup; the write error is the one worth reporting.
		_ = os.Remove(tmp)
		return fmt.Errorf("writing file: %w", err)
	}

	if err := os.Rename(tmp, dest); err != nil {
		// Best-effort cleanup; the rename error is the one worth reporting.
		_ = os.Remove(tmp)
		return fmt.Errorf("renaming %s to %s: %w", tmp, dest, err)
	}

	return nil
}

// filterCompletePeople returns the people that have all required fields set
// and match the fixed selection criteria, in their original order.
//
// An entry is kept only when:
//   - Gender, Born, City and Job are all non-zero,
//   - Gender is "M",
//   - Born lies in [1986, 2006], i.e. the person is 20-40 years old in 2026,
//   - City is exactly "Grudziądz".
//
// The job description is not used as a filter here; industry tagging is left
// to the LLM. The result is nil when nothing matches.
func (uc *CategorizeUseCase) filterCompletePeople(people []domain.Person) []domain.Person {
	const (
		referenceYear = 2026
		youngest      = 20
		oldest        = 40
	)

	// Birth-year window derived from the age limits.
	earliestBirth := referenceYear - oldest // 1986
	latestBirth := referenceYear - youngest // 2006

	var kept []domain.Person
	for _, p := range people {
		complete := p.Gender != "" && p.Born != 0 && p.City != "" && p.Job != ""
		isMale := p.Gender == "M"
		inAgeRange := p.Born >= earliestBirth && p.Born <= latestBirth
		inCity := p.City == "Grudziądz"

		if complete && isMale && inAgeRange && inCity {
			kept = append(kept, p)
		}
	}
	return kept
}

// isTransportJob reports whether jobDescription contains any of a fixed list
// of transport-related keywords. The description is lowercased before the
// substring search, so the match ignores letter case.
//
// The filter in filterCompletePeople does not use it.
func (uc *CategorizeUseCase) isTransportJob(jobDescription string) bool {
	// Mostly truncated word stems, presumably so that inflected forms match too.
	stems := [...]string{
		"transport",
		"pojazd",
		"samochód",
		"kierow",
		"prowadz", // prowadzenie pojazdu (driving a vehicle)
		"dostaw",
		"przewóz",
		"logistyk",
		"auto",
		"ciężar", // ciężarówka (truck)
		"bus",
		"tir",
		"wagon",
		"ruch",
		"droga",
		"trasa",
		"przesył",
	}

	haystack := strings.ToLower(jobDescription)

	found := false
	for i := 0; i < len(stems) && !found; i++ {
		found = strings.Contains(haystack, stems[i])
	}
	return found
}

// saveFilteredData writes the filtered people to filtered_people.json inside
// outputDir as indented JSON and prints the destination path.
//
// Each entry is stored with its name, surname, gender, born, city and job
// fields. An empty input produces a file containing an empty JSON array.
// A wrapped error is returned when marshaling or writing fails.
func (uc *CategorizeUseCase) saveFilteredData(people []domain.Person) error {
	// On-disk shape of one entry; it fixes the JSON key names of the file.
	type savedPerson struct {
		Name    string `json:"name"`
		Surname string `json:"surname"`
		Gender  string `json:"gender"`
		Born    int    `json:"born"`
		City    string `json:"city"`
		Job     string `json:"job"`
	}

	// Allocated with make so that zero people still encode as [] and not null.
	records := make([]savedPerson, 0, len(people))
	for _, person := range people {
		records = append(records, savedPerson{
			Name:    person.Name,
			Surname: person.Surname,
			Gender:  person.Gender,
			Born:    person.Born,
			City:    person.City,
			Job:     person.Job,
		})
	}

	encoded, err := json.MarshalIndent(records, "", "  ")
	if err != nil {
		return fmt.Errorf("marshaling filtered data: %w", err)
	}

	target := filepath.Join(uc.outputDir, "filtered_people.json")
	if writeErr := os.WriteFile(target, encoded, 0644); writeErr != nil {
		return fmt.Errorf("writing filtered data file: %w", writeErr)
	}

	fmt.Printf("[%s] Filtered data saved to: %s\n\n", time.Now().Format("2006-01-02 15:04:05"), target)
	return nil
}

// collectResults reads every batch file in outputDir and returns the
// concatenation of the people they contain.
//
// Only non-directory entries named batch_*.json are read. This matches the
// names Execute gives its batch files and keeps filtered_people.json, as well
// as any unrelated JSON file placed in the directory, out of the result.
//
// Files are visited in the order returned by os.ReadDir, which is sorted by
// filename. That order is lexicographic (batch_100_... comes before
// batch_20_...), so the result is grouped per batch but does not necessarily
// follow the original input order.
//
// It returns a wrapped error when the directory or a batch file cannot be
// read, or when a batch file does not contain a JSON array of people.
func (uc *CategorizeUseCase) collectResults() ([]domain.Person, error) {
	entries, err := os.ReadDir(uc.outputDir)
	if err != nil {
		return nil, fmt.Errorf("reading output directory: %w", err)
	}

	var allPeople []domain.Person
	for _, entry := range entries {
		name := entry.Name()
		if entry.IsDir() || !strings.HasPrefix(name, "batch_") || filepath.Ext(name) != ".json" {
			continue
		}

		data, err := os.ReadFile(filepath.Join(uc.outputDir, name))
		if err != nil {
			return nil, fmt.Errorf("reading file %s: %w", name, err)
		}

		var batch []domain.Person
		if err := json.Unmarshal(data, &batch); err != nil {
			return nil, fmt.Errorf("unmarshaling file %s: %w", name, err)
		}

		allPeople = append(allPeople, batch...)
	}

	return allPeople, nil
}

// buildPrompt returns the prompt text and the structured-output schema that
// are sent to the LLM for one batch of people.
//
// The prompt embeds the available tags and the batch (including each person's
// job description) as JSON, and states the batch size twice so the model is
// told how many entries to return. The schema describes a JSON array of
// objects with the required fields name, surname, gender, born, city and tags.
func (uc *CategorizeUseCase) buildPrompt(batch []domain.Person) (string, interface{}) {
	// promptTemplate takes, in order: tags JSON, people JSON, batch size, batch size.
	const promptTemplate = `Categorize the following people based on their job descriptions.
Each person should be assigned one or more appropriate tags from the available list based on their job field.

Available tags: %s

People to categorize (each person has name, surname, gender, born year, city, and job description):
%s

CRITICAL INSTRUCTIONS:
1. YOU MUST PROCESS ALL %d PEOPLE IN THE INPUT - NOT JUST A FEW!
2. Read EVERY person's job description carefully
3. Assign 1-3 relevant tags from the available list based on what the job description says
4. Tag mapping guidelines:
   - "IT" - for programming, software development, algorithms, data structures, technology
   - "transport" - for driving, vehicle operation, logistics
   - "edukacja" - for teaching, training, education, development of skills
   - "medycyna" - for healthcare, doctors, nurses, medical diagnosis, treatment
   - "praca z ludźmi" - for jobs involving direct work with people (teaching, healthcare, consulting, etc.)
   - "praca z pojazdami" - for mechanics, vehicle repair, automotive work
   - "praca fizyczna" - for manual labor, construction, carpentry, physical work
5. Many jobs can have multiple tags (e.g., a teacher = "edukacja" + "praca z ludźmi")
6. Return a complete JSON array with ALL %d people
7. Each person object must have: name, surname, gender, born, city, tags
8. Do NOT include the job description in the output
9. No explanations, no markdown formatting, just the JSON array`

	// Shape of one person as it appears inside the prompt.
	type promptPerson struct {
		Name    string `json:"name"`
		Surname string `json:"surname"`
		Gender  string `json:"gender"`
		Born    int    `json:"born"`
		City    string `json:"city"`
		Job     string `json:"job"`
	}

	entries := make([]promptPerson, 0, len(batch))
	for _, person := range batch {
		entries = append(entries, promptPerson{
			Name:    person.Name,
			Surname: person.Surname,
			Gender:  person.Gender,
			Born:    person.Born,
			City:    person.City,
			Job:     person.Job,
		})
	}

	// Marshal errors are discarded: this function has no error result, and
	// promptPerson holds only strings and an int.
	// NOTE(review): the tags value comes from domain.AvailableTags, whose type
	// is not visible here - confirm it cannot fail to marshal.
	peopleJSON, _ := json.Marshal(entries)
	tagsJSON, _ := json.Marshal(domain.AvailableTags())

	count := len(batch)
	prompt := fmt.Sprintf(promptTemplate, string(tagsJSON), string(peopleJSON), count, count)

	// ofType builds a fresh {"type": kind} schema node on every call.
	ofType := func(kind string) map[string]interface{} {
		return map[string]interface{}{"type": kind}
	}

	properties := map[string]interface{}{
		"name":    ofType("string"),
		"surname": ofType("string"),
		"gender":  ofType("string"),
		"born":    ofType("integer"),
		"city":    ofType("string"),
		"tags": map[string]interface{}{
			"type":  "array",
			"items": ofType("string"),
		},
	}

	personSchema := map[string]interface{}{
		"type":       "object",
		"properties": properties,
		"required":   []string{"name", "surname", "gender", "born", "city", "tags"},
	}

	// JSON Schema for structured output
	schema := map[string]interface{}{
		"name": "categorize_people",
		"schema": map[string]interface{}{
			"type":  "array",
			"items": personSchema,
		},
	}

	return prompt, schema
}

// parseBatchResponse decodes an LLM answer into a slice of people.
//
// The raw content is first passed through cleanJSONResponse to strip any text
// surrounding the JSON payload. A wrapped error and a nil slice are returned
// when the remaining text is not valid JSON for []domain.Person.
func (uc *CategorizeUseCase) parseBatchResponse(content string) ([]domain.Person, error) {
	payload := []byte(uc.cleanJSONResponse(content))

	var people []domain.Person
	err := json.Unmarshal(payload, &people)
	if err != nil {
		return nil, fmt.Errorf("unmarshaling response: %w", err)
	}
	return people, nil
}

// cleanJSONResponse extracts the JSON payload from a response that may carry
// extra text (prose, markdown fences) around it.
//
// After trimming whitespace it looks at the first "[" / last "]" and the first
// "{" / last "}". When a well-formed array span exists and the "[" comes
// before any "{", that span is returned; otherwise a well-formed object span
// is returned; otherwise the trimmed content is returned unchanged. The spans
// are located purely by position; the JSON itself is not validated.
func (uc *CategorizeUseCase) cleanJSONResponse(content string) string {
	text := strings.TrimSpace(content)

	// bounds locates the first opener and the last closer and reports whether
	// they delimit a non-empty span (opener found and closer after it).
	bounds := func(opener, closer string) (first, last int, ok bool) {
		first = strings.Index(text, opener)
		last = strings.LastIndex(text, closer)
		return first, last, first != -1 && last > first
	}

	arrFirst, arrLast, arrOK := bounds("[", "]")
	objFirst, objLast, objOK := bounds("{", "}")

	switch {
	case arrOK && (objFirst == -1 || arrFirst < objFirst):
		// The array opens before any object, so it is the outermost value.
		return text[arrFirst : arrLast+1]
	case objOK:
		return text[objFirst : objLast+1]
	default:
		// No valid JSON found, return as is
		return text
	}
}