{
  "dataset": {
    "name": "George-API/cognitive-data",
    "split": "train",
    "column_mapping": {
      "conversations": "text"
    },
    "processing": {
      "sort_by_id": true,
      "maintain_paper_order": true,
      "max_seq_length": 2048
    }
  },
  "data_formatting": {
    "chat_template": "phi",
    "roles": {
      "system": "System: {content}\n\n",
      "human": "Human: {content}\n\n",
      "assistant": "Assistant: {content}\n\n",
      "user": "Human: {content}\n\n"
    },
    "metadata_handling": {
      "include_paper_id": true,
      "include_chunk_number": true,
      "metadata_format": "Paper ID: {paper_id} | Chunk: {chunk_number}"
    }
  },
  "data_loading": {
    "batch_size": 24,
    "shuffle": false,
    "drop_last": false,
    "num_workers": 4,
    "pin_memory": true,
    "prefetch_factor": 4
  },
  "validation": {
    "log_samples": 3,
    "log_interval": 50,
    "metrics": ["processed", "skipped", "avg_tokens", "unique_papers"]
  }
}