import os
import logging
from typing import List, Optional, Tuple

import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

from .base_analyzer import BaseAnalyzer

logger = logging.getLogger(__name__)


class NERAnalyzer(BaseAnalyzer):
    """Extract person (PER) and organization (ORG) entities from Portuguese
    text using a BERT token-classification model, and format the result as a
    plain-text "social contract" analysis report.
    """

    def __init__(self):
        """Load the model and tokenizer once, at construction time."""
        # Public Portuguese model.
        # NOTE(review): this checkpoint is a base language model; confirm it
        # actually ships a fine-tuned token-classification (NER) head —
        # otherwise id2label holds generic LABEL_* names and target_labels
        # below will never match anything.
        self.model_name = "neuralmind/bert-base-portuguese-cased"
        logger.info("Carregando o modelo NER: %s", self.model_name)

        # Loading the model and tokenizer.
        self.model = AutoModelForTokenClassification.from_pretrained(self.model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        # BIO tags we keep: begin/inside of persons and organizations.
        self.target_labels = ['B-PER', 'I-PER', 'B-ORG', 'I-ORG']
        logger.info("Modelo NER e tokenizador carregados com sucesso")

    def extract_entities(self, text: str) -> List[Tuple[str, str]]:
        """Run the model over ``text`` and return ``(token, BIO-label)`` pairs
        for person/organization tokens only.

        WordPiece continuation markers (``##``) are intentionally KEPT on the
        tokens so that :meth:`extract_representatives` can stitch subwords back
        together without spurious spaces. Special tokens ([CLS]/[SEP]/[PAD])
        are dropped.
        """
        logger.debug("Iniciando extração de entidades com NER")

        # Pre-process: BERT's positional limit is 512; longer text is truncated.
        inputs = self.tokenizer(
            text,
            max_length=512,
            truncation=True,
            return_tensors="pt",
            padding=True,
        )
        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

        # Inference only — no gradients needed.
        with torch.no_grad():
            outputs = self.model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=2)

        entities: List[Tuple[str, str]] = []
        for token, prediction in zip(tokens, predictions[0].tolist()):
            entity_label = self.model.config.id2label[prediction]
            # Keep only PER/ORG tokens; skip the tokenizer's special tokens.
            if entity_label in self.target_labels and token not in ("[CLS]", "[SEP]", "[PAD]"):
                entities.append((token, entity_label))

        logger.info("Entidades extraídas: %s", entities)
        return entities

    def extract_representatives(self, entities: List[Tuple[str, str]]) -> List[str]:
        """Group BIO-tagged tokens into entity strings and return the distinct
        names, preserving first-occurrence order.

        Grouping rule (fixes the previous version): a ``B-`` tag ALWAYS starts
        a new entity; only an ``I-`` tag of the same type continues the current
        one. The old condition also treated a following ``B-`` as a
        continuation, gluing adjacent distinct names together. Tokens are
        joined with spaces and ``" ##"`` is removed so WordPiece subwords
        reattach to their head token.
        """
        if not entities:
            return []

        representatives: List[str] = []
        current_tokens: List[str] = []
        current_type: Optional[str] = None  # "PER" or "ORG" of the entity in progress

        for token, label in entities:
            entity_type = label[2:]  # strip the "B-"/"I-" prefix
            is_continuation = (
                label.startswith('I-')
                and entity_type == current_type
                and bool(current_tokens)
            )
            if is_continuation:
                current_tokens.append(token)
            else:
                if current_tokens:
                    representatives.append(" ".join(current_tokens).replace(" ##", ""))
                current_tokens = [token]
                current_type = entity_type

        # Flush the last entity.
        if current_tokens:
            representatives.append(" ".join(current_tokens).replace(" ##", ""))

        # Dedupe while keeping first-occurrence order (list(set(...)) was
        # nondeterministic), then drop single-character fragments.
        deduped = list(dict.fromkeys(representatives))
        cleaned = [rep.strip() for rep in deduped if len(rep.strip()) > 1]

        logger.info("Representantes extraídos: %s", cleaned)
        return cleaned

    def analyze(self, text: str) -> List[str]:
        """End-to-end pipeline: NER extraction followed by entity grouping."""
        entities = self.extract_entities(text)
        return self.extract_representatives(entities)

    def format_output(self, representatives: List[str]) -> str:
        """Render the representative list as a plain-text report (Portuguese)."""
        output = "ANÁLISE DO CONTRATO SOCIAL (NER)\n\n"
        if not representatives:
            output += "Nenhum representante ou empresa identificado.\n"
            return output
        output += "REPRESENTANTES E EMPRESAS IDENTIFICADOS:\n"
        for rep in representatives:
            output += f"- {rep}\n"
        return output