#!/usr/bin/env python
"""
数据库完整性检查和去重脚本
检查数据库中的重复记录，包括球员、球队和国家
按照规则处理重复项：保留先找到的，删除后找到的，并更新所有外键引用
"""

import os
import django
import sys
from pathlib import Path
from difflib import SequenceMatcher

# 添加项目根目录到Python路径
project_root = Path(__file__).resolve().parent
sys.path.insert(0, str(project_root))

# 设置Django环境
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'api.project_config.settings')
django.setup()

from api.models import Country, Team, Player, PlayerStats, MatchResult, LeagueStanding


class DatabaseDeduplicator:
    """
    数据库去重器
    检查并处理数据库中的重复记录
    """
    
    def __init__(self, threshold=0.85):
        """
        初始化去重器
        
        Args:
            threshold (float): 名称相似度阈值，高于此值认为是重复记录
        """
        self.threshold = threshold
    
    def calculate_similarity(self, name1, name2):
        """
        计算两个名称之间的相似度
        
        Args:
            name1 (str): 第一个名称
            name2 (str): 第二个名称
            
        Returns:
            float: 相似度 (0-1)
        """
        return SequenceMatcher(None, name1.lower(), name2.lower()).ratio()
    
    def deduplicate_countries(self):
        """
        去重国家记录
        """
        print("检查并去重国家记录...")
        countries = list(Country.objects.all().order_by('id'))
        processed = set()
        merged_count = 0
        
        for i, country1 in enumerate(countries):
            if country1.id in processed:
                continue
                
            processed.add(country1.id)
            
            for j, country2 in enumerate(countries[i+1:], i+1):
                if country2.id in processed:
                    continue
                    
                # 检查名称或代码是否相似
                name_similarity = self.calculate_similarity(country1.name, country2.name)
                code_similarity = self.calculate_similarity(country1.code, country2.code)
                
                if name_similarity >= self.threshold or code_similarity >= self.threshold:
                    print(f"  发现重复国家记录: '{country1.name}' (ID: {country1.id}) 和 '{country2.name}' (ID: {country2.id})")
                    
                    # 更新引用这个国家的外键
                    teams_updated = Team.objects.filter(country=country2).update(country=country1)
                    match_results_updated = MatchResult.objects.filter(country=country2).update(country=country1)
                    league_standings_updated = LeagueStanding.objects.filter(country=country2).update(country=country1)
                    
                    print(f"    更新了 {teams_updated} 个球队引用")
                    print(f"    更新了 {match_results_updated} 个比赛结果引用")
                    print(f"    更新了 {league_standings_updated} 个联赛积分引用")
                    
                    # 删除第二个国家记录
                    country2.delete()
                    processed.add(country2.id)
                    merged_count += 1
                    print(f"    删除了国家记录: '{country2.name}' (ID: {country2.id})")
        
        print(f"国家记录去重完成，共合并 {merged_count} 条记录")
        return merged_count
    
    def deduplicate_teams(self):
        """
        去重球队记录
        """
        print("检查并去重球队记录...")
        teams = list(Team.objects.all().order_by('id'))
        processed = set()
        merged_count = 0
        
        for i, team1 in enumerate(teams):
            if team1.id in processed:
                continue
                
            processed.add(team1.id)
            
            for j, team2 in enumerate(teams[i+1:], i+1):
                if team2.id in processed:
                    continue
                    
                # 检查名称是否相似
                name_similarity = self.calculate_similarity(team1.name, team2.name)
                
                if name_similarity >= self.threshold:
                    print(f"  发现重复球队记录: '{team1.name}' (ID: {team1.id}) 和 '{team2.name}' (ID: {team2.id})")
                    
                    # 更新引用这个球队的外键
                    player_stats_updated = PlayerStats.objects.filter(team=team2).update(team=team1)
                    home_matches_updated = MatchResult.objects.filter(home_team=team2).update(home_team=team1)
                    away_matches_updated = MatchResult.objects.filter(away_team=team2).update(away_team=team1)
                    league_standings_updated = LeagueStanding.objects.filter(team=team2).update(team=team1)
                    
                    print(f"    更新了 {player_stats_updated} 个球员统计数据引用")
                    print(f"    更新了 {home_matches_updated} 个主场比赛引用")
                    print(f"    更新了 {away_matches_updated} 个客场比赛引用")
                    print(f"    更新了 {league_standings_updated} 个联赛积分引用")
                    
                    # 删除第二个球队记录
                    team2.delete()
                    processed.add(team2.id)
                    merged_count += 1
                    print(f"    删除了球队记录: '{team2.name}' (ID: {team2.id})")
        
        print(f"球队记录去重完成，共合并 {merged_count} 条记录")
        return merged_count
    
    def deduplicate_players(self):
        """
        去重球员记录
        """
        print("检查并去重球员记录...")
        players = list(Player.objects.all().order_by('id'))
        processed = set()
        merged_count = 0
        
        for i, player1 in enumerate(players):
            if player1.id in processed:
                continue
                
            processed.add(player1.id)
            
            for j, player2 in enumerate(players[i+1:], i+1):
                if player2.id in processed:
                    continue
                    
                # 检查名称是否相似
                name_similarity = self.calculate_similarity(player1.name, player2.name)
                
                if name_similarity >= self.threshold:
                    print(f"  发现重复球员记录: '{player1.name}' (ID: {player1.id}) 和 '{player2.name}' (ID: {player2.id})")
                    
                    # 更新引用这个球员的外键
                    player_stats_updated = PlayerStats.objects.filter(player=player2).update(player=player1)
                    
                    print(f"    更新了 {player_stats_updated} 个球员统计数据引用")
                    
                    # 删除第二个球员记录
                    player2.delete()
                    processed.add(player2.id)
                    merged_count += 1
                    print(f"    删除了球员记录: '{player2.name}' (ID: {player2.id})")
        
        print(f"球员记录去重完成，共合并 {merged_count} 条记录")
        return merged_count
    
    def check_database_integrity(self):
        """
        检查数据库完整性
        """
        print("检查数据库完整性...")
        
        # 检查是否有孤立的记录
        orphaned_player_stats = PlayerStats.objects.filter(player__isnull=True).count()
        orphaned_player_stats += PlayerStats.objects.filter(team__isnull=True).count()
        
        orphaned_match_results = MatchResult.objects.filter(country__isnull=True).count()
        orphaned_match_results += MatchResult.objects.filter(home_team__isnull=True).count()
        orphaned_match_results += MatchResult.objects.filter(away_team__isnull=True).count()
        
        orphaned_league_standings = LeagueStanding.objects.filter(team__isnull=True).count()
        orphaned_league_standings += LeagueStanding.objects.filter(country__isnull=True).count()
        
        print(f"  孤立的球员统计数据记录: {orphaned_player_stats}")
        print(f"  孤立的比赛结果记录: {orphaned_match_results}")
        print(f"  孤立的联赛积分记录: {orphaned_league_standings}")
        
        if orphaned_player_stats + orphaned_match_results + orphaned_league_standings == 0:
            print("  ✓ 数据库完整性检查通过")
        else:
            print("  ✗ 发现孤立记录，请检查数据库完整性")
    
    def run_full_deduplication(self):
        """
        运行完整的去重流程
        """
        print("开始数据库去重和完整性检查...")
        print("=" * 60)
        
        # 按顺序进行去重
        countries_merged = self.deduplicate_countries()
        teams_merged = self.deduplicate_teams()
        players_merged = self.deduplicate_players()
        
        # 检查数据库完整性
        self.check_database_integrity()
        
        print("=" * 60)
        print("数据库去重和完整性检查完成:")
        print(f"  合并国家记录: {countries_merged} 条")
        print(f"  合并球队记录: {teams_merged} 条")
        print(f"  合并球员记录: {players_merged} 条")
        print(f"  总计合并记录: {countries_merged + teams_merged + players_merged} 条")


def main():
    """
    主函数
    """
    deduplicator = DatabaseDeduplicator(threshold=0.85)
    
    if '--check-only' in sys.argv:
        # 只检查不执行去重
        print("检查数据库重复记录（仅检查模式）...")
        # 这里可以添加只检查的逻辑
        pass
    else:
        # 执行完整的去重流程
        deduplicator.run_full_deduplication()


if __name__ == "__main__":
    main()