In [14]:
import os
import xml.etree.ElementTree as ET
from zipfile import ZipFile
from typing import Dict, Any, Set
from collections import Counter
import logging
import json

# Configure the root logger for this cell: messages at INFO level and above
# are emitted, each prefixed with a timestamp and the level name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

class BugSeverityProcessor:
    """Load per-project bug reports and reduce their severity to two classes.

    For each project folder below ``base_dir`` the processor parses
    ``short_desc.xml`` and ``severity.xml``, takes the last ``<update>`` of
    every ``<report>``, and joins the two files on the report ``id``
    attribute. The raw severity label is kept next to its binary mapping.
    """

    # Comprehensive severity mappings
    SEVERITY_MAPPINGS = {
        # Mozilla (Firefox/Thunderbird) severities
        'blocker': 'Severe',
        'critical': 'Severe',
        'major': 'Severe',
        'normal': 'Non-Severe',
        'minor': 'Non-Severe',
        'trivial': 'Non-Severe',
        'enhancement': 'Non-Severe',

        # Eclipse (JDT/PDE/Platform) severities
        'blocking': 'Severe',
        'critical_blocking': 'Severe',
        'major_blocking': 'Severe',
        'normal_blocking': 'Non-Severe',
        'minor_blocking': 'Non-Severe',
        'trivial_blocking': 'Non-Severe',

        # Bugzilla specific severities
        's1': 'Severe',
        's2': 'Severe',
        's3': 'Non-Severe',
        's4': 'Non-Severe',
        's5': 'Non-Severe',

        # Core/CDT specific severities
        'crash': 'Severe',
        'severe': 'Severe',
        'important': 'Severe',
        'medium': 'Non-Severe',
        'low': 'Non-Severe'
    }

    # Class assigned to any label that has no entry in SEVERITY_MAPPINGS.
    DEFAULT_SEVERITY = 'Non-Severe'

    def __init__(self, base_dir: str):
        """
        Create an empty processor rooted at ``base_dir``.

        Args:
            base_dir: Directory that contains one sub-folder per project
        """
        self.base_dir = base_dir
        self.projects = ['Bugzilla', 'CDT', 'Core', 'Firefox', 'JDT', 'PDE', 'Platform', 'Thunderbird']
        # project -> {report id -> merged description/severity record}
        self.all_data: Dict[str, Dict[str, Any]] = {}
        # project -> distinct raw severity labels read from severity.xml
        self.raw_labels: Dict[str, Set[str]] = {project: set() for project in self.projects}
        # project -> number of reports carrying each raw severity label
        self.label_stats: Dict[str, Counter] = {project: Counter() for project in self.projects}

    def convert_severity(self, severity: str) -> str:
        """
        Convert a severity label to the binary classification.

        The lookup ignores case and surrounding whitespace; labels without a
        mapping fall back to ``DEFAULT_SEVERITY``.

        Args:
            severity: The original severity label

        Returns:
            str: 'Severe' or 'Non-Severe'
        """
        return self.SEVERITY_MAPPINGS.get(severity.strip().lower(), self.DEFAULT_SEVERITY)

    def process_xml_file(self, file_path: str, report_type: str, project: str) -> Dict[str, Any]:
        """
        Read the last ``<update>`` of every ``<report>`` in one XML file.

        Errors are logged, never raised; a file that cannot be processed
        yields an empty dict and leaves the label statistics untouched.

        Args:
            file_path: Path to XML file
            report_type: Type of report ('desc' or 'severity'); any other
                value produces an empty result
            project: Project name under which raw labels are tracked. It does
                not have to be listed in ``self.projects``

        Returns:
            Dict mapping report id to ``{'when', 'what'}`` for 'desc' or to
            ``{'severity_mapped', 'severity_raw'}`` for 'severity'
        """
        data: Dict[str, Any] = {}
        # Labels are tallied locally and committed only after the whole file
        # has been read, so a failure cannot leave half-updated statistics.
        label_counts: Counter = Counter()

        try:
            root = ET.parse(file_path).getroot()

            for report in root.findall('.//report'):
                last_update = report.find('.//update[last()]')
                if last_update is None:
                    continue

                report_id = report.get('id')
                what_elem = last_update.find('what')

                if report_type == 'desc':
                    when_elem = last_update.find('when')
                    if when_elem is not None and what_elem is not None:
                        data[report_id] = {
                            'when': when_elem.text,
                            'what': what_elem.text
                        }

                elif report_type == 'severity':
                    if what_elem is not None and what_elem.text:
                        raw_severity = what_elem.text.strip()
                        label_counts[raw_severity] += 1
                        data[report_id] = {
                            'severity_mapped': self.convert_severity(raw_severity),
                            'severity_raw': raw_severity
                        }
        except FileNotFoundError:
            # A missing file is an expected condition, not an internal error.
            logging.warning(f"XML file not found: {file_path}")
            return {}
        except ET.ParseError as e:
            logging.error(f"Error parsing XML file {file_path}: {str(e)}")
            return {}
        except Exception as e:
            logging.error(f"Unexpected error processing {file_path}: {str(e)}")
            return {}

        if label_counts:
            # setdefault keeps projects that are not in self.projects working
            # instead of failing with a KeyError and losing the whole file.
            self.raw_labels.setdefault(project, set()).update(label_counts)
            self.label_stats.setdefault(project, Counter()).update(label_counts)

        return data

    def process_project(self, project: str) -> None:
        """
        Process all XML files for a given project.

        Only reports present in both ``short_desc.xml`` and ``severity.xml``
        are stored in ``self.all_data[project]``. Nothing is stored when the
        project folder does not exist.

        Args:
            project: Project name
        """
        project_folder = os.path.join(self.base_dir, project)
        if not os.path.exists(project_folder):
            logging.warning(f"Project folder not found: {project_folder}")
            return

        desc_data = self.process_xml_file(
            os.path.join(project_folder, 'short_desc.xml'), 'desc', project)
        severity_data = self.process_xml_file(
            os.path.join(project_folder, 'severity.xml'), 'severity', project)

        # Inner join on report id; description fields first, severity second.
        self.all_data[project] = {
            report_id: {**desc_record, **severity_data[report_id]}
            for report_id, desc_record in desc_data.items()
            if report_id in severity_data
        }

    def process_all_projects(self) -> None:
        """Process every project listed in ``self.projects``."""
        for project in self.projects:
            logging.info(f"Processing project: {project}")
            self.process_project(project)

    def generate_label_report(self) -> None:
        """Print the unique raw labels and their per-project distribution."""
        print("\n=== Raw Severity Labels Analysis ===\n")

        # All unique labels across all projects
        all_labels = set().union(*self.raw_labels.values())

        print(f"Total unique severity labels found: {len(all_labels)}")
        print("\nAll unique severity labels:")
        for label in sorted(all_labels):
            print(f"- {label:<20} -> {self.convert_severity(label)}")

        print("\nLabel distribution by project:")
        for project, stats in self.label_stats.items():
            if not stats:
                continue
            print(f"\n{project}:")
            total = sum(stats.values())
            for label, count in stats.most_common():
                percentage = (count / total) * 100
                mapped_value = self.convert_severity(label)
                print(f"  - {label:<20} : {count:>5} ({percentage:>6.2f}%) -> {mapped_value}")

def main():
    """Extract the dataset archive, process every project and print statistics.

    Unpacks ``/content/test1.zip`` into ``/content/unzipped_data/``, runs
    ``BugSeverityProcessor`` over the extracted folders, prints the label
    report, and finally prints the share of reports classified as 'Severe'
    for each project that has at least one merged report. An extraction
    failure is logged and ends the run early.
    """
    # Extract ZIP file
    zip_file_path = '/content/test1.zip'
    destination_directory = '/content/unzipped_data/'

    try:
        os.makedirs(destination_directory, exist_ok=True)
        # Use the ``ZipFile`` name this cell imports; ``zipfile.ZipFile`` only
        # resolved when another cell had already run ``import zipfile`` and
        # otherwise raised a NameError that was reported as an extraction error.
        with ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(destination_directory)
        logging.info(f"Successfully extracted ZIP file to {destination_directory}")
    except Exception as e:
        logging.error(f"Error extracting ZIP file: {str(e)}")
        return

    # Process and analyze the bug reports
    processor = BugSeverityProcessor(destination_directory)
    processor.process_all_projects()
    processor.generate_label_report()

    # Print binary classification statistics
    print("\n=== Binary Classification Statistics ===\n")
    for project, data in processor.all_data.items():
        severe_count = sum(1 for report in data.values()
                          if report.get('severity_mapped') == 'Severe')
        total_count = len(data)
        if total_count > 0:
            severe_percentage = (severe_count / total_count) * 100
            print(f"{project}: {severe_count}/{total_count} "
                  f"({severe_percentage:.1f}%) classified as Severe")

if __name__ == "__main__":
    main()


=== Raw Severity Labels Analysis ===

Total unique severity labels found: 6

All unique severity labels:
- blocker              -> Severe
- critical             -> Severe
- major                -> Severe
- minor                -> Non-Severe
- normal               -> Non-Severe
- trivial              -> Non-Severe

Label distribution by project:

Bugzilla:
  - normal               :  2478 ( 53.68%) -> Non-Severe
  - minor                :   766 ( 16.59%) -> Non-Severe
  - major                :   506 ( 10.96%) -> Severe
  - trivial              :   415 (  8.99%) -> Non-Severe
  - blocker              :   275 (  5.96%) -> Severe
  - critical             :   176 (  3.81%) -> Severe

CDT:
  - normal               :  4547 ( 80.62%) -> Non-Severe
  - major                :   490 (  8.69%) -> Severe
  - minor                :   275 (  4.88%) -> Non-Severe
  - critical             :   166 (  2.94%) -> Severe
  - trivial              :    84 (  1.49%) -> Non-Severe
  - blocker              :  

In [15]:
import os
import xml.etree.ElementTree as ET
from zipfile import ZipFile
from typing import Dict, Any, Set
from collections import Counter
import logging
import json

# Configure the root logger for this cell: messages at INFO level and above
# are emitted, each prefixed with a timestamp and the level name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

class BugSeverityProcessor:
    """Load FIXED bug reports per project and reduce severity to two classes.

    For each project folder below ``base_dir`` the processor parses
    ``short_desc.xml``, ``severity.xml`` and ``resolution.xml``, takes the
    last ``<update>`` of every ``<report>``, and keeps only reports whose
    last resolution is FIXED and that appear in all three files.
    """

    # Comprehensive severity mappings
    SEVERITY_MAPPINGS = {
        # Mozilla (Firefox/Thunderbird) severities
        'blocker': 'Severe',
        'critical': 'Severe',
        'major': 'Severe',
        'normal': 'Non-Severe',
        'minor': 'Non-Severe',
        'trivial': 'Non-Severe',
        'enhancement': 'Non-Severe',

        # Eclipse (JDT/PDE/Platform) severities
        'blocking': 'Severe',
        'critical_blocking': 'Severe',
        'major_blocking': 'Severe',
        'normal_blocking': 'Non-Severe',
        'minor_blocking': 'Non-Severe',
        'trivial_blocking': 'Non-Severe',

        # Bugzilla specific severities
        's1': 'Severe',
        's2': 'Severe',
        's3': 'Non-Severe',
        's4': 'Non-Severe',
        's5': 'Non-Severe',

        # Core/CDT specific severities
        'crash': 'Severe',
        'severe': 'Severe',
        'important': 'Severe',
        'medium': 'Non-Severe',
        'low': 'Non-Severe'
    }

    # Class assigned to any label that has no entry in SEVERITY_MAPPINGS.
    DEFAULT_SEVERITY = 'Non-Severe'

    def __init__(self, base_dir: str):
        """
        Create an empty processor rooted at ``base_dir``.

        Args:
            base_dir: Directory that contains one sub-folder per project
        """
        self.base_dir = base_dir
        self.projects = ['Bugzilla', 'CDT', 'Core', 'Firefox', 'JDT', 'PDE', 'Platform', 'Thunderbird']
        # project -> {report id -> merged record of a FIXED bug}
        self.all_data: Dict[str, Dict[str, Any]] = {}
        # project -> distinct raw severity labels of ALL reports in severity.xml
        self.raw_labels: Dict[str, Set[str]] = {project: set() for project in self.projects}
        # project -> count of each raw severity label over ALL reports
        self.label_stats: Dict[str, Counter] = {project: Counter() for project in self.projects}
        # project -> ids of reports whose last resolution is FIXED
        self.fixed_bugs: Dict[str, Set[str]] = {project: set() for project in self.projects}

    def convert_severity(self, severity: str) -> str:
        """
        Convert a severity label to the binary classification.

        The lookup ignores case and surrounding whitespace; labels without a
        mapping fall back to ``DEFAULT_SEVERITY``.

        Args:
            severity: The original severity label

        Returns:
            str: 'Severe' or 'Non-Severe'
        """
        return self.SEVERITY_MAPPINGS.get(severity.strip().lower(), self.DEFAULT_SEVERITY)

    def process_xml_file(self, file_path: str, report_type: str, project: str) -> Dict[str, Any]:
        """
        Read the last ``<update>`` of every ``<report>`` in one XML file.

        Errors are logged, never raised; a file that cannot be processed
        yields an empty dict and leaves the tracked statistics untouched.

        Args:
            file_path: Path to XML file
            report_type: Type of report ('desc', 'severity', or 'resolution');
                any other value produces an empty result
            project: Project name under which labels and FIXED ids are
                tracked. It does not have to be listed in ``self.projects``

        Returns:
            Dict mapping report id to ``{'when', 'what'}`` for 'desc', to
            ``{'severity_mapped', 'severity_raw'}`` for 'severity', or to
            ``{'resolution': 'FIXED'}`` for 'resolution' (FIXED reports only)
        """
        data: Dict[str, Any] = {}
        # Tallied locally and committed only after the whole file has been
        # read, so a failure cannot leave half-updated bookkeeping behind.
        label_counts: Counter = Counter()
        fixed_ids = []

        try:
            root = ET.parse(file_path).getroot()

            for report in root.findall('.//report'):
                last_update = report.find('.//update[last()]')
                if last_update is None:
                    continue

                report_id = report.get('id')
                what_elem = last_update.find('what')

                if report_type == 'desc':
                    when_elem = last_update.find('when')
                    if when_elem is not None and what_elem is not None:
                        data[report_id] = {
                            'when': when_elem.text,
                            'what': what_elem.text
                        }

                elif report_type == 'severity':
                    if what_elem is not None and what_elem.text:
                        raw_severity = what_elem.text.strip()
                        label_counts[raw_severity] += 1
                        data[report_id] = {
                            'severity_mapped': self.convert_severity(raw_severity),
                            'severity_raw': raw_severity
                        }

                elif report_type == 'resolution':
                    if what_elem is not None and what_elem.text:
                        if what_elem.text.strip().upper() == 'FIXED':
                            fixed_ids.append(report_id)
                            data[report_id] = {'resolution': 'FIXED'}
        except FileNotFoundError:
            # A missing file is an expected condition, not an internal error.
            logging.warning(f"XML file not found: {file_path}")
            return {}
        except ET.ParseError as e:
            logging.error(f"Error parsing XML file {file_path}: {str(e)}")
            return {}
        except Exception as e:
            logging.error(f"Unexpected error processing {file_path}: {str(e)}")
            return {}

        # setdefault keeps projects that are not in self.projects working
        # instead of failing with a KeyError.
        if label_counts:
            self.raw_labels.setdefault(project, set()).update(label_counts)
            self.label_stats.setdefault(project, Counter()).update(label_counts)
        if fixed_ids:
            self.fixed_bugs.setdefault(project, set()).update(fixed_ids)

        return data

    def process_project(self, project: str) -> None:
        """
        Process all XML files for a given project.

        Stores in ``self.all_data[project]`` the merged records of reports
        that are FIXED and present in both the description and the severity
        file. Nothing is stored when the project folder does not exist.

        Args:
            project: Project name
        """
        project_folder = os.path.join(self.base_dir, project)
        if not os.path.exists(project_folder):
            logging.warning(f"Project folder not found: {project_folder}")
            return

        desc_data = self.process_xml_file(
            os.path.join(project_folder, 'short_desc.xml'), 'desc', project)
        severity_data = self.process_xml_file(
            os.path.join(project_folder, 'severity.xml'), 'severity', project)
        resolution_data = self.process_xml_file(
            os.path.join(project_folder, 'resolution.xml'), 'resolution', project)

        # Merge the data only for FIXED bugs. Driving the join from
        # resolution_data (which holds exactly the FIXED reports of this run)
        # keeps document order and cannot hit an id missing from that dict.
        self.all_data[project] = {
            report_id: {
                **desc_data[report_id],
                **severity_data[report_id],
                **resolution_record
            }
            for report_id, resolution_record in resolution_data.items()
            if report_id in desc_data and report_id in severity_data
        }

    def process_all_projects(self) -> None:
        """Process every project listed in ``self.projects``."""
        for project in self.projects:
            logging.info(f"Processing project: {project}")
            self.process_project(project)

    def generate_label_report(self) -> None:
        """Print the raw severity labels of FIXED bugs and their distribution.

        Every figure is derived from ``self.all_data`` (the merged FIXED
        records), so the list of unique labels matches the "FIXED Bugs Only"
        heading; projects without FIXED bugs are omitted.
        """
        print("\n=== Raw Severity Labels Analysis (FIXED Bugs Only) ===\n")

        # Per-project label counts restricted to the merged FIXED records
        fixed_stats = {
            project: Counter(record['severity_raw'] for record in records.values())
            for project, records in self.all_data.items()
        }

        # All unique labels across all projects
        all_labels = set().union(*fixed_stats.values())

        print(f"Total unique severity labels found: {len(all_labels)}")
        print("\nAll unique severity labels:")
        for label in sorted(all_labels):
            print(f"- {label:<20} -> {self.convert_severity(label)}")

        print("\nLabel distribution by project (FIXED bugs only):")
        for project, stats in fixed_stats.items():
            if not stats:
                continue
            print(f"\n{project}:")
            fixed_bugs_count = sum(stats.values())
            for label, count in stats.most_common():
                percentage = (count / fixed_bugs_count) * 100
                mapped_value = self.convert_severity(label)
                print(f"  - {label:<20} : {count:>5} ({percentage:>6.2f}%) -> {mapped_value}")

def main():
    """Unpack the dataset archive and report severity statistics of FIXED bugs.

    The archive ``/content/test1.zip`` is extracted into
    ``/content/unzipped_data/``; a failure there is logged and stops the run.
    Otherwise every project is processed, the label report is printed, and
    for each project with at least one merged record the number and share of
    'Severe' reports is printed.
    """
    archive_path = '/content/test1.zip'
    extract_root = '/content/unzipped_data/'

    # Step 1: extraction (any error aborts the run after being logged)
    try:
        os.makedirs(extract_root, exist_ok=True)
        with ZipFile(archive_path, 'r') as archive:
            archive.extractall(extract_root)
        logging.info(f"Successfully extracted ZIP file to {extract_root}")
    except Exception as exc:
        logging.error(f"Error extracting ZIP file: {str(exc)}")
        return

    # Step 2: parse, merge and report
    processor = BugSeverityProcessor(extract_root)
    processor.process_all_projects()
    processor.generate_label_report()

    # Step 3: binary classification summary per project
    print("\n=== Binary Classification Statistics (FIXED Bugs Only) ===\n")
    for project, records in processor.all_data.items():
        if not records:
            # Projects without merged records are left out of the summary.
            continue
        total = len(records)
        severe = len([
            record for record in records.values()
            if record.get('severity_mapped') == 'Severe'
        ])
        share = (severe / total) * 100
        print(f"{project}: {severe}/{total} "
              f"({share:.1f}%) classified as Severe")
        print(f"Total FIXED bugs: {total}")


if __name__ == "__main__":
    main()


=== Raw Severity Labels Analysis (FIXED Bugs Only) ===

Total unique severity labels found: 6

All unique severity labels:
- blocker              -> Severe
- critical             -> Severe
- major                -> Severe
- minor                -> Non-Severe
- normal               -> Non-Severe
- trivial              -> Non-Severe

Label distribution by project (FIXED bugs only):

Bugzilla:
  - normal               :  1033 ( 42.48%) -> Non-Severe
  - minor                :   492 ( 20.23%) -> Non-Severe
  - trivial              :   289 ( 11.88%) -> Non-Severe
  - blocker              :   265 ( 10.90%) -> Severe
  - major                :   253 ( 10.40%) -> Severe
  - critical             :   100 (  4.11%) -> Severe

CDT:
  - normal               :  3539 ( 83.51%) -> Non-Severe
  - major                :   303 (  7.15%) -> Severe
  - minor                :   194 (  4.58%) -> Non-Severe
  - critical             :    89 (  2.10%) -> Severe
  - trivial              :    70 (  1.65%) -> Non

In [None]:
from zipfile import ZipFile
# Unpack the whole archive into the "unzipped_data" directory, which is
# resolved relative to the current working directory.
with ZipFile('/content/test1.zip', mode='r') as archive:
    archive.extractall(path='unzipped_data')


In [None]:
# Mount Google Drive #Skip
import os
import zipfile

from google.colab import drive

drive.mount('/content/drive')

# Path of the ZIP archive to unpack (note: this path is not under the
# /content/drive mount point used above).
zip_file_path = '/content/test1.zip'

# Directory that receives the extracted files.
destination_directory = '/content/unzipped_data/'

# Make sure the target directory exists, then unpack the whole archive.
os.makedirs(destination_directory, exist_ok=True)
with zipfile.ZipFile(zip_file_path, 'r') as archive:
    archive.extractall(destination_directory)

# List what ended up in the destination directory.
os.listdir(destination_directory)




import os
import xml.etree.ElementTree as ET
from zipfile import ZipFile

# List of project names to process
projects = ['Bugzilla', 'CDT', 'Core', 'Firefox', 'JDT', 'PDE', 'Platform', 'Thunderbird']

# Dictionary to store the extracted data: project name -> {report id -> record}
all_data = {}


# Function to convert severity labels
def convert_severity(severity):
    """Map a raw severity label to 'Severe' or 'Non-Severe'.

    'critical', 'blocker' and 'major' count as 'Severe'; string labels are
    compared without regard to case or surrounding whitespace. Every other
    value, including ``None``, is 'Non-Severe'.
    """
    label = severity.strip().lower() if isinstance(severity, str) else severity
    if label in ('critical', 'blocker', 'major'):
        return 'Severe'
    return 'Non-Severe'


for project in projects:
    project_folder = f"unzipped_data/{project}"

    # Define the XML file names for each project
    xml_files = ['short_desc.xml', 'severity.xml']

    # Dictionary to store the extracted data for the current project
    project_data = {}

    # Start every project with empty lists: without this a missing file made
    # the loops below fail with NameError (first project) or silently reuse
    # the previous project's reports.
    reports = []
    severities = []

    for xml_file in xml_files:
        file_path = os.path.join(project_folder, xml_file)

        if os.path.exists(file_path):
            tree = ET.parse(file_path)
            root = tree.getroot()

            if xml_file == 'short_desc.xml':
                reports = root.findall('.//report')
            elif xml_file == 'severity.xml':
                severities = root.findall('.//report')

    # Last recorded description of every report
    for report in reports:
        report_id = report.get('id')
        last_update = report.find('.//update[last()]')
        if last_update is None:
            continue

        when_elem = last_update.find('when')
        what_elem = last_update.find('what')
        # Skip updates lacking either child instead of crashing on ``.text``.
        if when_elem is None or what_elem is None:
            continue

        record = project_data.setdefault(report_id, {})
        record['when'] = when_elem.text
        record['what'] = what_elem.text

    # Last recorded severity, attached only to reports that have a description
    for severity in severities:
        report_id = severity.get('id')
        last_update = severity.find('.//update[last()]')
        if last_update is None:
            continue

        last_what_element = last_update.find('what[last()]')
        if last_what_element is not None and report_id in project_data:
            project_data[report_id]['severity'] = convert_severity(last_what_element.text)

    # Store the data for the current project
    all_data[project] = project_data

# Now you have all the data for different projects in the 'all_data' dictionary
# You can use this data to train and validate your model as needed

# Now you have all the data for different projects in the 'all_data' dictionary
# You can use this data to train and validate your model as needed



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


['Platform', 'Bugzilla', 'Core', 'Firefox', 'JDT', 'Thunderbird', 'CDT', 'PDE']

In [None]:
import xml.etree.ElementTree as ET

def extract_data_from_xml(xml_file):
    """Return the text of the root's ``severity`` and ``summary`` children.

    Args:
        xml_file: Path or file object accepted by ``ET.parse``.

    Returns:
        Tuple ``(severity, summary)``. An entry is ``None`` when the element
        is absent (previously this raised AttributeError) or has no text.

    Raises:
        xml.etree.ElementTree.ParseError: If the document is not valid XML.
    """
    root = ET.parse(xml_file).getroot()

    def child_text(tag):
        # ``find`` only looks at direct children of the root element.
        node = root.find(tag)
        return None if node is None else node.text

    return child_text('severity'), child_text('summary')


In [None]:
#Skip
import random
# Candidate projects; the order is randomised in place below.
project_folders = ['Bugzilla', 'CDT', 'Core', 'Firefox', 'JDT', 'PDE', 'Platform', 'Thunderbird']
random.shuffle(project_folders)

# One iteration per project: with 8 projects and 8 iterations every project
# is the testing project exactly once.
num_iterations = 8

for i in range(num_iterations):
    # Hold out the i-th project and train on all the others.
    testing_project = project_folders[i]
    training_projects = [
        name for position, name in enumerate(project_folders) if position != i
    ]

    print(f"Iteration {i + 1}:")
    print(f"Training Projects: {training_projects}")
    print(f"Testing Project: {testing_project}\n")

Iteration 1:
Training Projects: ['PDE', 'CDT', 'Core', 'Thunderbird', 'JDT', 'Bugzilla', 'Platform']
Testing Project: Firefox

Iteration 2:
Training Projects: ['Firefox', 'CDT', 'Core', 'Thunderbird', 'JDT', 'Bugzilla', 'Platform']
Testing Project: PDE

Iteration 3:
Training Projects: ['Firefox', 'PDE', 'Core', 'Thunderbird', 'JDT', 'Bugzilla', 'Platform']
Testing Project: CDT

Iteration 4:
Training Projects: ['Firefox', 'PDE', 'CDT', 'Thunderbird', 'JDT', 'Bugzilla', 'Platform']
Testing Project: Core

Iteration 5:
Training Projects: ['Firefox', 'PDE', 'CDT', 'Core', 'JDT', 'Bugzilla', 'Platform']
Testing Project: Thunderbird

Iteration 6:
Training Projects: ['Firefox', 'PDE', 'CDT', 'Core', 'Thunderbird', 'Bugzilla', 'Platform']
Testing Project: JDT

Iteration 7:
Training Projects: ['Firefox', 'PDE', 'CDT', 'Core', 'Thunderbird', 'JDT', 'Platform']
Testing Project: Bugzilla

Iteration 8:
Training Projects: ['Firefox', 'PDE', 'CDT', 'Core', 'Thunderbird', 'JDT', 'Bugzilla']
Testing Pro

In [None]:
import os
import xml.etree.ElementTree as ET

# Project directory (relative to the current working directory) and the XML
# files inside it that are probed.
project_folder = "CDT"  # Replace with the project folder you want to process
xml_files = ['assigned_to.xml', 'bug_status.xml', 'cc.xml', 'component.xml', 'short_desc.xml', 'severity.xml']

# file name -> {'severity': ..., 'short_description': ...}
data = {}

# For every file that exists, read the text of the root's direct <severity>
# and <short_desc> children, falling back to "N/A" when a child is absent.
for xml_file in xml_files:
    file_path = os.path.join(project_folder, xml_file)
    if not os.path.exists(file_path):
        continue

    tree = ET.parse(file_path)
    root = tree.getroot()

    severity_element = root.find('severity')
    short_desc_element = root.find('short_desc')

    severity = "N/A" if severity_element is None else severity_element.text
    short_desc = "N/A" if short_desc_element is None else short_desc_element.text

    data[xml_file] = dict(severity=severity, short_description=short_desc)

# Show what was collected, one paragraph per file.
for file, content in data.items():
    print(f"File: {file}")
    print(f"Severity: {content['severity']}")
    print(f"Short Description: {content['short_description']}")
    print()


In [None]:
import os
import xml.etree.ElementTree as ET
from zipfile import ZipFile

# Unpack the archive next to the notebook before reading anything from it.
with ZipFile('test1.zip', 'r') as archive:
    archive.extractall('unzipped_data')

# Extracted project directory and the XML files inside it that are probed.
project_folder = "unzipped_data/CDT"  # Replace with the project folder you want to process
xml_files = ['assigned_to.xml', 'bug_status.xml', 'cc.xml', 'component.xml', 'short_desc.xml', 'severity.xml']

# file name -> {'severity': ..., 'short_description': ...}
data = {}

# For every file that exists, read the text of the root's direct <severity>
# and <short_desc> children, falling back to "N/A" when a child is absent.
for xml_file in xml_files:
    file_path = os.path.join(project_folder, xml_file)
    if not os.path.exists(file_path):
        continue

    tree = ET.parse(file_path)
    root = tree.getroot()

    severity_element = root.find('severity')
    short_desc_element = root.find('short_desc')

    severity = "N/A" if severity_element is None else severity_element.text
    short_desc = "N/A" if short_desc_element is None else short_desc_element.text

    data[xml_file] = dict(severity=severity, short_description=short_desc)

# Show what was collected, one paragraph per file.
for file, content in data.items():
    print(f"File: {file}")
    print(f"Severity: {content['severity']}")
    print(f"Short Description: {content['short_description']}")
    print()


File: assigned_to.xml
Severity: N/A
Short Description: N/A

File: bug_status.xml
Severity: N/A
Short Description: N/A

File: cc.xml
Severity: N/A
Short Description: N/A

File: component.xml
Severity: N/A
Short Description: N/A

File: short_desc.xml
Severity: N/A
Short Description: N/A

File: severity.xml
Severity: N/A
Short Description: N/A



In [None]:
import os
import xml.etree.ElementTree as ET
from zipfile import ZipFile

# Extract the ZIP file
with ZipFile('test1.zip', 'r') as zip_ref:
    zip_ref.extractall('unzipped_data')

# Define the project folder and XML file names
project_folder = "unzipped_data/Bugzilla"  # Replace with the project folder you want to process
xml_files = ['short_desc.xml']  # Use only the short_desc.xml file

# Dictionary to store the extracted data: report id -> {'when', 'what'}
data = {}

# Loop through the XML files and extract the last <update> tag for each report
for xml_file in xml_files:
    file_path = os.path.join(project_folder, xml_file)

    if os.path.exists(file_path):
        tree = ET.parse(file_path)
        root = tree.getroot()

        # Find all <report> elements
        reports = root.findall('.//report')

        for report in reports:
            report_id = report.get('id')
            last_update = report.find('.//update[last()]')
            if last_update is None:
                continue

            when_elem = last_update.find('when')
            what_elem = last_update.find('what')
            # An update lacking <when> or <what> is skipped; reading ``.text``
            # from the missing element used to abort the cell with
            # AttributeError.
            if when_elem is None or what_elem is None:
                continue

            data[report_id] = {'when': when_elem.text, 'what': what_elem.text}

# Print the extracted data
for report_id, content in data.items():
    print(f"Report ID: {report_id}")
    print(f"When: {content['when']}")
    print(f"What: {content['what']}")
    print()


Streaming output truncated to the last 5000 lines.
Report ID: 536190
When: 1261382350
What: Migration from 3.0.4 to 3.4.4: Problem with user member of many groups (more than 500)

Report ID: 536364
When: 1261458281
What: wrtw

Report ID: 536553
When: 1261549211
What: Change case of default status and workflow names from UPPERCASE to InitialCaps

Report ID: 536583
When: 1261563117
What: "Features" underlines on mouseover

Report ID: 536589
When: 1261564854
What: No ability to manually set language

Report ID: 537083
When: 1262059084
What: The attachment table is too narrow by default

Report ID: 537111
When: 1262075348
What: help link is broken, excludes locale..

Report ID: 537295
When: 1262168751
What: Patch to fix t/008filter.t error for template/en/default/reports/duplicates.rdf.tmpl

Report ID: 537328
When: 1262184848
What: Clicking 'reply' shouldn't add anything to my history

Report ID: 537746
When: 1262597564
What: Search Criteria Headers on Buglist Duplicate Unnec

In [None]:
# Displays Each Project Short Description and Severity Label

import os
import xml.etree.ElementTree as ET
from zipfile import ZipFile

# Extract the ZIP file
with ZipFile('test1.zip', 'r') as zip_ref:
    zip_ref.extractall('unzipped_data')

# Define the project folder and XML file names
project_folder = "unzipped_data/Bugzilla"
xml_files = ['short_desc.xml', 'severity.xml']

# Dictionary to store the extracted data: report id -> {'when', 'what', 'severity'}
data = {}

# Default to empty lists so a missing file yields no reports instead of a
# NameError (or stale elements from an earlier cell) in the loops below.
reports = []
severities = []

# Collect the <report> elements of each file
for xml_file in xml_files:
    file_path = os.path.join(project_folder, xml_file)

    if os.path.exists(file_path):
        tree = ET.parse(file_path)
        root = tree.getroot()

        if xml_file == 'short_desc.xml':
            reports = root.findall('.//report')
        elif xml_file == 'severity.xml':
            severities = root.findall('.//report')

# Extract the last update for reports
for report in reports:
    report_id = report.get('id')
    last_update = report.find('.//update[last()]')
    if last_update is None:
        continue

    when_elem = last_update.find('when')
    what_elem = last_update.find('what')
    # Skip updates lacking either child instead of crashing on ``.text``.
    if when_elem is None or what_elem is None:
        continue

    entry = data.setdefault(report_id, {})
    entry['when'] = when_elem.text
    entry['what'] = what_elem.text

# Extract the severity data (only for reports that have a description entry)
for severity in severities:
    report_id = severity.get('id')
    last_update = severity.find('.//update[last()]')
    if last_update is None:
        continue

    last_what_element = last_update.find('what[last()]')
    if last_what_element is not None and report_id in data:
        data[report_id]['severity'] = last_what_element.text

# Print the extracted data
for report_id, content in data.items():
    print(f"Report ID: {report_id}")
    print(f"When: {content['when']}")
    print(f"What: {content['what']}")
    print(f"Severity: {content.get('severity', 'N/A')}")
    print()


Streaming output truncated to the last 5000 lines.
Report ID: 573450
When: 1277083196
What: keyworddefs.description should be NOT NULL
Severity: normal

Report ID: 573451
When: 1277083789
What: Customizing Bugzilla whine e-mail template
Severity: normal

Report ID: 574029
When: 1277280903
What: The red star besides the "Component" label in show_bug.cgi has no meaning
Severity: normal

Report ID: 574166
When: 1277297505
What: clean_search_url needs to take into account email3 fields.
Severity: normal

Report ID: 574177
When: 1277300059
What: product and version fields should also have the is_mandatory => 1 option in the fielddefs
Severity: minor

Report ID: 574327
When: 1277349186
What: Map image/x-png to image/png when uploaded by IE
Severity: minor

Report ID: 574566
When: 1277407485
What: "reporter" is missing from OPERATOR_FIELD_OVERRIDE in Bugzilla::Search
Severity: normal

Report ID: 574892
When: 1277485739
What: Initial CC and Default QA Contact not set on bugs if t

In [None]:
import os
import xml.etree.ElementTree as ET
from zipfile import ZipFile

# List of project names to process
projects = ['Bugzilla', 'CDT', 'Core', 'Firefox', 'JDT', 'PDE', 'Platform', 'Thunderbird']

# Dictionary to store the extracted data
all_data = {}

# Function to convert severity labels
def convert_severity(severity):
    """Map a raw severity label to 'Severe' or 'Non-Severe'.

    The label is compared case-insensitively and with surrounding
    whitespace removed, so 'Critical' and ' major ' are classified the
    same way as 'critical' and 'major'.

    Args:
        severity: The raw label read from the XML. May be None when the
            source element has no text.

    Returns:
        'Severe' for critical / blocker / major, otherwise 'Non-Severe'
        (including for None or any other non-string value).
    """
    if not isinstance(severity, str):
        # An empty <what> element yields None; treat it as not severe.
        return 'Non-Severe'

    if severity.strip().lower() in ('critical', 'blocker', 'major'):
        return 'Severe'
    return 'Non-Severe'

# Parse short_desc.xml and severity.xml of every project and collect, per
# report id, the latest timestamp, short description and binary severity.
for project in projects:
    project_folder = f"unzipped_data/{project}"

    # Define the XML file names for each project
    xml_files = ['short_desc.xml', 'severity.xml']

    # Dictionary to store the extracted data for the current project
    project_data = {}

    # Reset for every project: otherwise a missing file would silently reuse
    # the elements parsed for the previous project (or raise NameError on the
    # very first project).
    reports = []
    severities = []

    for xml_file in xml_files:
        file_path = os.path.join(project_folder, xml_file)

        if not os.path.exists(file_path):
            print(f"Warning: {file_path} not found; skipping it.")
            continue

        tree = ET.parse(file_path)
        root = tree.getroot()

        if xml_file == 'short_desc.xml':
            reports = root.findall('.//report')
        elif xml_file == 'severity.xml':
            severities = root.findall('.//report')

    for report in reports:
        report_id = report.get('id')
        last_update = report.find('.//update[last()]')
        if last_update is None:
            continue

        # A missing <when>/<what> child is stored as None instead of raising
        # AttributeError on `.text`.
        when_element = last_update.find('when')
        what_element = last_update.find('what')

        entry = project_data.setdefault(report_id, {})
        entry['when'] = when_element.text if when_element is not None else None
        entry['what'] = what_element.text if what_element is not None else None

    for severity in severities:
        report_id = severity.get('id')
        last_update = severity.find('.//update[last()]')
        if last_update is None:
            continue

        last_what_element = last_update.find('what[last()]')
        if last_what_element is None:
            continue

        severity_label = last_what_element.text
        converted_severity = convert_severity(severity_label)
        # Only label reports for which a short description was collected
        if report_id in project_data:
            project_data[report_id]['severity'] = converted_severity

    # Store the data for the current project
    all_data[project] = project_data

# Now you have all the data for different projects in the 'all_data' dictionary
# You can use this data to train and validate your model as needed


In [None]:
# Project whose first few reports are displayed, and how many to show
sample_project = 'CDT'
sample_count = 5  # Number of samples to print

if sample_project not in all_data:
    print(f"Data for project '{sample_project}' not found.")
else:
    project_data = all_data[sample_project]

    # Walk the reports in stored order and stop once enough have been shown
    for shown, (report_id, content) in enumerate(project_data.items(), start=1):
        print(f"Report ID: {report_id}")
        print(f"When: {content['when']}")
        print(f"What: {content['what']}")
        print(f"Severity: {content.get('severity', 'N/A')}")
        print()

        if shown >= sample_count:
            break


Report ID: 126211
When: 1138878213
What: Consecutive C prog. runs -> Error logged from Debug Core:
Severity: Non-Severe

Report ID: 126262
When: 1138895064
What: Manual change in Memory or Variables view is not propagated to Expressions view
Severity: Non-Severe

Report ID: 127262
When: 1140770667
What: [Preferences] Consumers of File/Directory FieldEditor values need to quote/escape
Severity: Non-Severe

Report ID: 126025
When: 1140427188
What: CApplicationLaunchShortcut call to DebugUITools.saveAndBuild invokes workspace wide build unnecessarily
Severity: Non-Severe

Report ID: 128667
When: 1140666379
What: unpredictable switching between "all" and "clean all"
Severity: Severe



In [None]:
# Initialize dictionaries to store counts
bug_report_counts = {}
severe_counts = {}
non_severe_counts = {}

# Encode a converted severity label as an integer class: 1 = severe, 0 = anything else
def convert_severity_to_binary(severity):
    """Return 1 when `severity` equals 'Severe' exactly, otherwise 0."""
    return 1 if severity == 'Severe' else 0

# Tally total, severe and non-severe report counts for every project
for project, project_data in all_data.items():
    # Reports without a severity entry are counted as 'Non-Severe'
    flags = [
        convert_severity_to_binary(content.get('severity', 'Non-Severe'))
        for content in project_data.values()
    ]
    severe_total = flags.count(1)

    bug_report_counts[project] = len(project_data)
    severe_counts[project] = severe_total
    non_severe_counts[project] = len(flags) - severe_total

# Print the counts
for project in projects:
    print(f"Project: {project}")
    print(f"Total Bug Reports: {bug_report_counts.get(project, 0)}")
    print(f"Severe Bug Reports: {severe_counts.get(project, 0)}")
    print(f"Non-Severe Bug Reports: {non_severe_counts.get(project, 0)}")
    print()


Project: Bugzilla
Total Bug Reports: 4616
Severe Bug Reports: 957
Non-Severe Bug Reports: 3659

Project: CDT
Total Bug Reports: 5640
Severe Bug Reports: 734
Non-Severe Bug Reports: 4906

Project: Core
Total Bug Reports: 74292
Severe Bug Reports: 15236
Non-Severe Bug Reports: 59056

Project: Firefox
Total Bug Reports: 69879
Severe Bug Reports: 16322
Non-Severe Bug Reports: 53557

Project: JDT
Total Bug Reports: 10814
Severe Bug Reports: 1368
Non-Severe Bug Reports: 9446

Project: PDE
Total Bug Reports: 5655
Severe Bug Reports: 640
Non-Severe Bug Reports: 5015

Project: Platform
Total Bug Reports: 24775
Severe Bug Reports: 4122
Non-Severe Bug Reports: 20653

Project: Thunderbird
Total Bug Reports: 19237
Severe Bug Reports: 4941
Non-Severe Bug Reports: 14296



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Combine the data from different projects into two parallel lists:
# short descriptions and binary labels (1 = 'Severe', 0 = anything else).
all_bug_reports = []
all_severities = []

for project, project_data in all_data.items():
    for report_id, content in project_data.items():
        short_desc = content['what']
        severity = content.get('severity', 'Non-Severe')  # Default to 'Non-Severe'

        if short_desc is not None:  # Check for None values
            all_bug_reports.append(short_desc)
            binary_severity = 1 if severity == 'Severe' else 0
            all_severities.append(binary_severity)

# This quick baseline only uses the first SAMPLE_SIZE reports, taken in the
# iteration order of `all_data`.
SAMPLE_SIZE = 200
sample_reports = all_bug_reports[:SAMPLE_SIZE]
sample_severities = all_severities[:SAMPLE_SIZE]

# train_test_split raises ValueError when either side of the split would be
# empty, so the "no data" check has to happen before the split rather than
# after it. Two samples are the minimum for a non-empty train and test set.
if len(sample_reports) >= 2:
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        sample_reports, sample_severities, test_size=0.2, random_state=42
    )

    # Vectorize the text data using TF-IDF (vocabulary is fit on the training
    # split only, then reused for the test split)
    tfidf_vectorizer = TfidfVectorizer(max_features=5000)
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)

    # Train a simple classifier (e.g., Multinomial Naive Bayes)
    classifier = MultinomialNB()
    classifier.fit(X_train_tfidf, y_train)

    # Evaluate the model
    y_pred = classifier.predict(X_test_tfidf)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")

    print("Classification Report:")
    print(classification_report(y_test, y_pred))
else:
    print("No valid data to train the model. Please check your input data.")


Accuracy: 0.875
Classification Report:
              precision    recall  f1-score   support

           0       0.87      1.00      0.93        34
           1       1.00      0.17      0.29         6

    accuracy                           0.88        40
   macro avg       0.94      0.58      0.61        40
weighted avg       0.89      0.88      0.83        40



In [None]:
pip install transformers

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m100.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m70.2 MB/s[0m eta [36m0:00:00[0m
Co

In [None]:
pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/261.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/261.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.24.1


In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from transformers import AdamW, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, classification_report




import torch

# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise
gpu_available = torch.cuda.is_available()
device = torch.device("cuda") if gpu_available else torch.device("cpu")

if not gpu_available:
    print('No GPU available, using the CPU instead.')
else:
    # Report how many GPUs are visible and the name of the first one
    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
#SKIP
if X_train:  # Check if X_train is not empty
    # Tokenize your data
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    encodings = tokenizer(X_train, truncation=True, padding=True, return_tensors='pt')
    eval_encodings = tokenizer(X_test, truncation=True, padding=True, return_tensors='pt')

    # Create a dataset and dataloader
    class CustomDataset(torch.utils.data.Dataset):
        """Pairs tokenizer output with the integer label of each example."""

        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            # One row of every encoding tensor plus the label under 'labels'
            item = {key: val[idx] for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx])
            return item

        def __len__(self):
            return len(self.labels)

    dataset = CustomDataset(encodings, y_train)
    eval_dataset = CustomDataset(eval_encodings, y_test)

    data_collator = DataCollatorWithPadding(tokenizer)

    def compute_metrics(eval_pred):
        """Compute accuracy from the logits and labels handed over by Trainer.

        The Trainer expects a dict of metric name -> value; the metric is
        derived from the evaluation batch itself rather than from variables
        that only exist after training has finished.
        """
        predictions = eval_pred.predictions.argmax(-1)
        return {'accuracy': accuracy_score(eval_pred.label_ids, predictions)}

    # Fine-tune the model
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    model.to(device)  # Move the model to the selected device

    optimizer = AdamW(model.parameters(), lr=1e-4)  # Increased learning rate

    # Define training parameters
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=16,  # Increased batch size
        gradient_accumulation_steps=2,  # Gradient accumulation
        evaluation_strategy='steps',
        eval_steps=100,  # Reduced evaluation frequency
        save_total_limit=2,
        save_strategy='epoch',  # Save the model every epoch
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        # Hand the optimizer to the Trainer; otherwise it is never used and
        # the Trainer builds its own with the default learning rate. The
        # scheduler is left for the Trainer to create.
        optimizers=(optimizer, None),
    )

    trainer.train()

    # Save the fine-tuned model
    model.save_pretrained("fine_tuned_bert_model")

    # Make predictions on your data
    predicted_labels = trainer.predict(eval_dataset).predictions.argmax(-1)
    accuracy = accuracy_score(y_test, predicted_labels)
    report = classification_report(y_test, predicted_labels)
    print(f"Accuracy: {accuracy}")
    print(report)
else:
    print("No valid data to train the model. Please check your input data.")


In [None]:
#SKIP
import torch
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Flatten every project's reports into (short description, severity) pairs;
# a missing description becomes '' and a missing severity 'Non-Severe'.
labelled_reports = [
    (content.get('what', ''), content.get('severity', 'Non-Severe'))
    for project_data in all_data.values()
    for content in project_data.values()
]

# Keep only reports with a non-empty description; label 1 = 'Severe', 0 = otherwise
all_bug_reports = [text for text, _ in labelled_reports if text]
all_severities = [1 if label == 'Severe' else 0 for text, label in labelled_reports if text]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(all_bug_reports, all_severities, test_size=0.2, random_state=42)

if X_train:  # Check if X_train is not empty
    # Use the GPU when one is visible, otherwise fall back to the CPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Tokenize your data
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    encodings = tokenizer(X_train, truncation=True, padding=True, return_tensors='pt')
    eval_encodings = tokenizer(X_test, truncation=True, padding=True, return_tensors='pt')

    # Keep the datasets on the CPU. The Trainer moves each batch to the
    # model's device itself, and its DataLoader pins host memory by default,
    # which does not work for tensors that already live on the GPU.
    # The labels get their own names so y_train / y_test stay untouched and
    # the cell can be re-run.
    train_labels = torch.as_tensor(y_train, dtype=torch.long).cpu()
    eval_labels = torch.as_tensor(y_test, dtype=torch.long).cpu()

    # Create a dataset and dataloader
    class CustomDataset(torch.utils.data.Dataset):
        """Pairs tokenizer output with the label tensor of each example."""

        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: val[idx] for key, val in self.encodings.items()}
            item['labels'] = self.labels[idx].clone().detach()
            return item

        def __len__(self):
            return len(self.labels)

    dataset = CustomDataset(encodings, train_labels)
    eval_dataset = CustomDataset(eval_encodings, eval_labels)

    # Fine-tune the model
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=1e-5)

    # Define training parameters
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=2,
        # Evaluate once per epoch; evaluating after every single optimizer
        # step would run the whole evaluation set for each training batch.
        evaluation_strategy='epoch',
        save_total_limit=2,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        eval_dataset=eval_dataset,
        # Pass the optimizer in so that lr=1e-5 is actually applied; the
        # scheduler is left for the Trainer to create.
        optimizers=(optimizer, None),
    )

    trainer.train()

    # Save the fine-tuned model
    model.save_pretrained("fine_tuned_bert_model")

    # Make predictions on your data
    predicted_labels = []

    model.eval()  # Disable dropout for inference
    for batch in torch.utils.data.DataLoader(eval_dataset, batch_size=2, shuffle=False):
        with torch.no_grad():
            batch = {key: val.to(model.device) for key, val in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            predicted_labels.extend(logits.argmax(dim=1).tolist())

    true_labels = eval_labels.tolist()
    accuracy = accuracy_score(true_labels, predicted_labels)
    report = classification_report(true_labels, predicted_labels)
    print(f"Accuracy: {accuracy}")
    print(report)
else:
    print("No valid data to train the model. Please check your input data.")


In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Gather short descriptions and binary severity labels across all projects
all_bug_reports = []
all_severities = []

for project_data in all_data.values():
    for content in project_data.values():
        short_desc = content.get('what', '')  # '' when the description is absent
        if not short_desc:
            # Reports without usable text (None or empty) are left out
            continue

        # A report without a severity entry is treated as 'Non-Severe'
        is_severe = content.get('severity', 'Non-Severe') == 'Severe'
        all_bug_reports.append(short_desc)
        all_severities.append(1 if is_severe else 0)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(all_bug_reports, all_severities, test_size=0.2, random_state=42)



In [None]:
if X_train:  # Check if X_train is not empty
    # Use the GPU when one is visible, otherwise fall back to the CPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Tokenize your data
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    encodings = tokenizer(X_train, truncation=True, padding=True, return_tensors='pt')
    eval_encodings = tokenizer(X_test, truncation=True, padding=True, return_tensors='pt')

    # train_test_split returns plain Python lists here, which have no `.to()`
    # method, so build label tensors explicitly. They stay on the CPU (the
    # Trainer moves each batch to the model's device) and get their own names
    # so y_train / y_test remain lists and the cell can be re-run.
    train_labels = torch.as_tensor(y_train, dtype=torch.long).cpu()
    eval_labels = torch.as_tensor(y_test, dtype=torch.long).cpu()

    # Create a dataset and dataloader
    class CustomDataset(torch.utils.data.Dataset):
        """Pairs tokenizer output with the label tensor of each example."""

        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: val[idx] for key, val in self.encodings.items()}
            item['labels'] = self.labels[idx]
            return item

        def __len__(self):
            return len(self.labels)

    dataset = CustomDataset(encodings, train_labels)
    eval_dataset = CustomDataset(eval_encodings, eval_labels)

    # Fine-tune the model
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=1e-5)

    # Define training parameters
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=2,
        # Evaluate once per epoch; evaluating after every single optimizer
        # step would run the whole evaluation set for each training batch.
        evaluation_strategy='epoch',
        save_total_limit=2,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        eval_dataset=eval_dataset,
        # Pass the optimizer in so that lr=1e-5 is actually applied; the
        # scheduler is left for the Trainer to create.
        optimizers=(optimizer, None),
    )

    trainer.train()

    # Save the fine-tuned model
    model.save_pretrained("fine_tuned_bert_model")

    # Make predictions on your data
    predicted_labels = []

    model.eval()  # Disable dropout for inference
    for batch in torch.utils.data.DataLoader(eval_dataset, batch_size=2, shuffle=False):
        with torch.no_grad():
            batch = {key: val.to(model.device) for key, val in batch.items()}  # Move batch to the model's device
            outputs = model(**batch)
            logits = outputs.logits
            predicted_labels.extend(logits.argmax(dim=1).tolist())

    true_labels = eval_labels.tolist()
    accuracy = accuracy_score(true_labels, predicted_labels)
    report = classification_report(true_labels, predicted_labels)
    print(f"Accuracy: {accuracy}")
    print(report)
else:
    print("No valid data to train the model. Please check your input data.")


NameError: ignored