import os
import logging
from typing import List, Optional, Tuple

import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

from .base_analyzer import BaseAnalyzer

logger = logging.getLogger(__name__)


class NERAnalyzer(BaseAnalyzer):
    """Extract person (PER) and organization (ORG) entities from Portuguese
    text using a BERT token-classification model, and format the result as a
    plain-text "social contract" analysis report.
    """

    def __init__(self):
        """Load the model and tokenizer once, at construction time."""
        # Public Portuguese model.
        # NOTE(review): this checkpoint is a base language model; confirm it
        # actually ships a fine-tuned token-classification (NER) head —
        # otherwise id2label holds generic LABEL_* names and target_labels
        # below will never match anything.
        self.model_name = "neuralmind/bert-base-portuguese-cased"
        logger.info("Carregando o modelo NER: %s", self.model_name)

        # Loading the model and tokenizer.
        self.model = AutoModelForTokenClassification.from_pretrained(self.model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        # BIO tags we keep: begin/inside of persons and organizations.
        self.target_labels = ['B-PER', 'I-PER', 'B-ORG', 'I-ORG']
        logger.info("Modelo NER e tokenizador carregados com sucesso")

    def extract_entities(self, text: str) -> List[Tuple[str, str]]:
        """Run the model over ``text`` and return ``(token, BIO-label)`` pairs
        for person/organization tokens only.

        WordPiece continuation markers (``##``) are intentionally KEPT on the
        tokens so that :meth:`extract_representatives` can stitch subwords back
        together without spurious spaces. Special tokens ([CLS]/[SEP]/[PAD])
        are dropped.
        """
        logger.debug("Iniciando extração de entidades com NER")

        # Pre-process: BERT's positional limit is 512; longer text is truncated.
        inputs = self.tokenizer(
            text,
            max_length=512,
            truncation=True,
            return_tensors="pt",
            padding=True,
        )
        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

        # Inference only — no gradients needed.
        with torch.no_grad():
            outputs = self.model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=2)

        entities: List[Tuple[str, str]] = []
        for token, prediction in zip(tokens, predictions[0].tolist()):
            entity_label = self.model.config.id2label[prediction]
            # Keep only PER/ORG tokens; skip the tokenizer's special tokens.
            if entity_label in self.target_labels and token not in ("[CLS]", "[SEP]", "[PAD]"):
                entities.append((token, entity_label))

        logger.info("Entidades extraídas: %s", entities)
        return entities

    def extract_representatives(self, entities: List[Tuple[str, str]]) -> List[str]:
        """Group BIO-tagged tokens into entity strings and return the distinct
        names, preserving first-occurrence order.

        Grouping rule (fixes the previous version): a ``B-`` tag ALWAYS starts
        a new entity; only an ``I-`` tag of the same type continues the current
        one. The old condition also treated a following ``B-`` as a
        continuation, gluing adjacent distinct names together. Tokens are
        joined with spaces and ``" ##"`` is removed so WordPiece subwords
        reattach to their head token.
        """
        if not entities:
            return []

        representatives: List[str] = []
        current_tokens: List[str] = []
        current_type: Optional[str] = None  # "PER" or "ORG" of the entity in progress

        for token, label in entities:
            entity_type = label[2:]  # strip the "B-"/"I-" prefix
            is_continuation = (
                label.startswith('I-')
                and entity_type == current_type
                and bool(current_tokens)
            )
            if is_continuation:
                current_tokens.append(token)
            else:
                if current_tokens:
                    representatives.append(" ".join(current_tokens).replace(" ##", ""))
                current_tokens = [token]
                current_type = entity_type

        # Flush the last entity.
        if current_tokens:
            representatives.append(" ".join(current_tokens).replace(" ##", ""))

        # Dedupe while keeping first-occurrence order (list(set(...)) was
        # nondeterministic), then drop single-character fragments.
        deduped = list(dict.fromkeys(representatives))
        cleaned = [rep.strip() for rep in deduped if len(rep.strip()) > 1]

        logger.info("Representantes extraídos: %s", cleaned)
        return cleaned

    def analyze(self, text: str) -> List[str]:
        """End-to-end pipeline: NER extraction followed by entity grouping."""
        entities = self.extract_entities(text)
        return self.extract_representatives(entities)

    def format_output(self, representatives: List[str]) -> str:
        """Render the representative list as a plain-text report (Portuguese)."""
        output = "ANÁLISE DO CONTRATO SOCIAL (NER)\n\n"
        if not representatives:
            output += "Nenhum representante ou empresa identificado.\n"
            return output
        output += "REPRESENTANTES E EMPRESAS IDENTIFICADOS:\n"
        for rep in representatives:
            output += f"- {rep}\n"
        return output