#!/usr/bin/env python
"""
Player record deduplication tool.

Uses a fuzzy-matching algorithm to detect and merge duplicate player
records caused by misspellings and similar data-entry errors.
"""

import os
import django
import sys
from pathlib import Path
from difflib import SequenceMatcher

# Make the project root importable so the `api` package resolves.
project_root = Path(__file__).resolve().parent
sys.path.insert(0, str(project_root))

# Configure and bootstrap Django BEFORE importing any ORM models below.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'api.project_config.settings')
django.setup()

from api.models import Player, PlayerStats


class PlayerDeduplicator:
    """
    Player record deduplicator.

    Uses fuzzy name matching (difflib.SequenceMatcher) to find Player
    records that likely refer to the same person, and can merge them by
    re-pointing their PlayerStats rows at one surviving record and
    deleting the rest.
    """

    def __init__(self, threshold=0.85):
        """
        Initialize the deduplicator.

        Args:
            threshold (float): Name-similarity ratio in [0, 1]; pairs at
                or above this value are treated as duplicates.
        """
        self.threshold = threshold

    def calculate_similarity(self, name1, name2):
        """
        Compute the similarity between two names (case-insensitive).

        Args:
            name1 (str): First name.
            name2 (str): Second name.

        Returns:
            float: Similarity ratio in [0, 1].
        """
        return SequenceMatcher(None, name1.lower(), name2.lower()).ratio()

    def find_duplicate_candidates(self):
        """
        Find groups of records that are likely duplicates.

        Greedy single pass: each not-yet-grouped record seeds a group and
        absorbs every later record whose name matches it at or above the
        threshold.

        Returns:
            list: Groups (lists of Player) with two or more members.
        """
        players = list(Player.objects.all().order_by('name'))
        duplicate_groups = []
        processed = set()

        for i, player1 in enumerate(players):
            if player1.id in processed:
                continue

            group = [player1]
            processed.add(player1.id)

            # Only look at later records; earlier ones are already grouped.
            for player2 in players[i + 1:]:
                if player2.id in processed:
                    continue

                similarity = self.calculate_similarity(player1.name, player2.name)
                if similarity >= self.threshold:
                    group.append(player2)
                    processed.add(player2.id)

            if len(group) > 1:
                duplicate_groups.append(group)

        return duplicate_groups

    def display_duplicate_candidates(self):
        """
        Print the detected duplicate-candidate groups, or a notice when
        there are none.
        """
        duplicates = self.find_duplicate_candidates()

        if not duplicates:
            print("未发现可能的重复记录")
            return

        print(f"发现 {len(duplicates)} 组可能的重复记录:")
        print("=" * 60)

        for i, group in enumerate(duplicates, 1):
            print(f"\n第 {i} 组重复记录:")
            for player in group:
                stats_count = player.stats.count()
                print(f"  ID: {player.id:3d} | 姓名: {player.name:<30} | 统计数据: {stats_count} 条")

    def merge_duplicate_players(self, player_ids, target_id=None):
        """
        Merge duplicate player records into a single surviving record.

        All PlayerStats rows of the source players are re-pointed at the
        target player, then the source records are deleted. The whole
        merge runs inside a database transaction so a failure cannot
        leave a half-merged state.

        Args:
            player_ids (list): IDs of the players to merge.
            target_id (int): ID of the surviving player; defaults to the
                lowest ID in the list.

        Returns:
            Player: The surviving (target) player record.

        Raises:
            ValueError: If fewer than two distinct IDs are given, an ID
                does not exist, or target_id is not among player_ids.
        """
        # De-duplicate the input while preserving order, so a repeated ID
        # does not later masquerade as a missing record.
        unique_ids = list(dict.fromkeys(player_ids))
        if len(unique_ids) < 2:
            raise ValueError("至少需要2个球员记录才能合并")

        players = list(Player.objects.filter(id__in=unique_ids).order_by('id'))
        if len(players) != len(unique_ids):
            raise ValueError("某些球员ID不存在")

        # Pick the surviving record. Compare against None explicitly so a
        # falsy-but-valid ID would still be honored.
        if target_id is not None:
            target_player = next((p for p in players if p.id == target_id), None)
            if not target_player:
                raise ValueError(f"目标球员ID {target_id} 不在合并列表中")
        else:
            # Default to the first record (lowest ID due to ordering).
            target_player = players[0]

        source_players = [p for p in players if p.id != target_player.id]

        print(f"合并球员记录到 '{target_player.name}' (ID: {target_player.id}):")

        # Function-scope import: the module header only imports the models.
        from django.db import transaction

        # Atomic so that stats transfer + deletion either all happen or
        # none do.
        with transaction.atomic():
            for source_player in source_players:
                # Re-point the stats rows at the surviving player.
                stats_updated = PlayerStats.objects.filter(player=source_player).update(player=target_player)
                print(f"  从 '{source_player.name}' (ID: {source_player.id}) 转移了 {stats_updated} 条统计数据")

                # Remove the now-empty source record.
                source_player.delete()
                print(f"  删除了球员记录 '{source_player.name}' (ID: {source_player.id})")

        return target_player

    def auto_merge_all_duplicates(self, auto_confirm=False):
        """
        Merge every detected duplicate group, optionally prompting per group.

        Args:
            auto_confirm (bool): When True, merge without asking; otherwise
                ask for a per-group y/N confirmation on stdin.
        """
        duplicates = self.find_duplicate_candidates()

        if not duplicates:
            print("未发现可能的重复记录")
            return

        print(f"发现 {len(duplicates)} 组可能的重复记录，准备自动合并:")
        print("=" * 60)

        for i, group in enumerate(duplicates, 1):
            print(f"\n第 {i} 组:")
            player_ids = [p.id for p in group]

            # Show the group members so the operator can judge the merge.
            for player in group:
                stats_count = player.stats.count()
                print(f"  ID: {player.id:3d} | 姓名: {player.name:<30} | 统计数据: {stats_count} 条")

            # The first member of the group survives.
            target_player = group[0]
            print(f"  -> 将合并到: {target_player.name} (ID: {target_player.id})")

            if not auto_confirm:
                confirm = input(f"\n确认合并第 {i} 组记录? (y/N): ")
                if confirm.lower() != 'y':
                    print("  跳过此组")
                    continue

            try:
                self.merge_duplicate_players(player_ids, target_player.id)
                print(f"  ✓ 第 {i} 组合并完成")
            except Exception as e:
                # Keep going: one failed group must not block the rest.
                print(f"  ✗ 合并失败: {e}")

        print("\n所有重复记录处理完成")


def main():
    """
    Command-line entry point.

    With ``--merge`` as the first argument, detected duplicate groups are
    merged (add ``--auto`` to skip per-group confirmation); otherwise the
    candidates are only listed.
    """
    deduplicator = PlayerDeduplicator(threshold=0.85)
    merge_requested = len(sys.argv) > 1 and sys.argv[1] == '--merge'
    if merge_requested:
        # Perform the merge, auto-confirming when --auto was passed.
        deduplicator.auto_merge_all_duplicates(auto_confirm='--auto' in sys.argv)
    else:
        # Read-only mode: just report the candidate groups.
        deduplicator.display_duplicate_candidates()


if __name__ == "__main__":
    main()