ftshijt committed on
Commit
7ee1777
·
1 Parent(s): e85c61b

Add a full walkthrough fix for NLTK data directory permissions

Browse files
Files changed (3) hide show
  1. Dockerfile +17 -8
  2. app.py +26 -13
  3. fix_nltk_permissions.py +81 -0
Dockerfile CHANGED
@@ -1,7 +1,5 @@
1
  FROM python:3.9-slim
2
 
3
- USER root
4
-
5
  WORKDIR /app
6
 
7
  # Install system dependencies
@@ -13,6 +11,13 @@ RUN apt-get update && apt-get install -y \
13
  && apt-get clean \
14
  && rm -rf /var/lib/apt/lists/*
15
 
 
 
 
 
 
 
 
16
  # Copy requirements file
17
  COPY requirements.txt .
18
 
@@ -20,13 +25,22 @@ COPY requirements.txt .
20
  RUN pip install --no-cache-dir -U pip && \
21
  pip install --no-cache-dir -r requirements.txt
22
 
 
 
 
 
23
  # Clone VERSA repository
24
  RUN git clone https://github.com/wavlab-speech/versa.git && \
25
  cd versa && \
26
  pip install -e .
27
 
28
  # Set up data directories
29
- RUN mkdir -p /app/data/configs /app/data/uploads /app/data/results
 
 
 
 
 
30
 
31
  # Copy universal metrics YAML file
32
  COPY universal_metrics.yaml /app/data/configs/
@@ -49,10 +63,5 @@ RUN mkdir -m 777 /tmp/NUMBA_CACHE_DIR /tmp/MPLCONFIGDIR
49
  ENV NUMBA_CACHE_DIR=/tmp/NUMBA_CACHE_DIR/
50
  ENV MPLCONFIGDIR=/tmp/MPLCONFIGDIR/
51
 
52
- # Pre-download NLTK data
53
- ENV NLTK_DATA /app/nltk_data
54
- RUN python -m nltk.downloader -d /app/nltk_data punkt stopwords wordnet
55
-
56
-
57
  # Run the application
58
  CMD ["python", "app.py"]
 
1
  FROM python:3.9-slim
2
 
 
 
3
  WORKDIR /app
4
 
5
  # Install system dependencies
 
11
  && apt-get clean \
12
  && rm -rf /var/lib/apt/lists/*
13
 
14
+ # Create NLTK data directory with correct permissions
15
+ RUN mkdir -p /usr/local/share/nltk_data && \
16
+ chmod -R 777 /usr/local/share/nltk_data
17
+
18
+ # Set NLTK_DATA environment variable
19
+ ENV NLTK_DATA=/usr/local/share/nltk_data
20
+
21
  # Copy requirements file
22
  COPY requirements.txt .
23
 
 
25
  RUN pip install --no-cache-dir -U pip && \
26
  pip install --no-cache-dir -r requirements.txt
27
 
28
+ # Pre-download NLTK data
29
+ RUN python -c "import nltk; nltk.download('punkt', download_dir='$NLTK_DATA'); nltk.download('stopwords', download_dir='$NLTK_DATA'); nltk.download('wordnet', download_dir='$NLTK_DATA')"
30
+
31
+
32
  # Clone VERSA repository
33
  RUN git clone https://github.com/wavlab-speech/versa.git && \
34
  cd versa && \
35
  pip install -e .
36
 
37
  # Set up data directories
38
+ RUN mkdir -p /app/data/configs /app/data/uploads /app/data/results && \
39
+ chmod -R 777 /app/data
40
+
41
+ # Copy and run the NLTK fix script
42
+ COPY fix_nltk_permissions.py .
43
+ RUN python fix_nltk_permissions.py
44
 
45
  # Copy universal metrics YAML file
46
  COPY universal_metrics.yaml /app/data/configs/
 
63
  ENV NUMBA_CACHE_DIR=/tmp/NUMBA_CACHE_DIR/
64
  ENV MPLCONFIGDIR=/tmp/MPLCONFIGDIR/
65
 
 
 
 
 
 
66
  # Run the application
67
  CMD ["python", "app.py"]
app.py CHANGED
@@ -11,6 +11,12 @@ import matplotlib.pyplot as plt
11
  import time
12
  from pathlib import Path
13
 
 
 
 
 
 
 
14
  # VERSA paths - these are set by the Dockerfile
15
  VERSA_ROOT = "/app/versa"
16
  VERSA_BIN = os.path.join(VERSA_ROOT, "versa", "bin", "scorer.py")
@@ -22,20 +28,18 @@ UPLOAD_DIR = os.path.join(DATA_DIR, "uploads")
22
  RESULTS_DIR = os.path.join(DATA_DIR, "results")
23
  CONFIG_DIR = os.path.join(DATA_DIR, "configs")
24
 
25
- # Set NLTK_DATA directory to a writable location within the container
26
- os.environ["NLTK_DATA"] = "/app/nltk_data"
27
- os.makedirs("/app/nltk_data", exist_ok=True)
 
 
28
 
29
- # Pre-download NLTK data during startup
30
- try:
31
- import nltk
32
- for package in ['punkt', 'stopwords', 'wordnet']:
33
- try:
34
- nltk.download(package, quiet=True, download_dir='/app/nltk_data')
35
- except:
36
- pass
37
- except Exception as e:
38
- print(f"Warning: NLTK setup failed: {e}")
39
 
40
  # Check if VERSA is installed
41
  def check_versa_installation():
@@ -305,6 +309,15 @@ def evaluate_audio(gt_file, pred_file, metric_config, include_timestamps=False):
305
 
306
  # Run VERSA evaluation
307
  try:
 
 
 
 
 
 
 
 
 
308
  process = subprocess.run(
309
  cmd,
310
  check=True,
 
11
  import time
12
  from pathlib import Path
13
 
14
+ # Set NLTK data directory - use the environment variable or default to a writable location
15
+ nltk_data_dir = os.environ.get("NLTK_DATA", "/usr/local/share/nltk_data")
16
+ os.environ["NLTK_DATA"] = nltk_data_dir
17
+ os.makedirs(nltk_data_dir, exist_ok=True)
18
+ print(f"NLTK data directory set to: {nltk_data_dir}")
19
+
20
  # VERSA paths - these are set by the Dockerfile
21
  VERSA_ROOT = "/app/versa"
22
  VERSA_BIN = os.path.join(VERSA_ROOT, "versa", "bin", "scorer.py")
 
28
  RESULTS_DIR = os.path.join(DATA_DIR, "results")
29
  CONFIG_DIR = os.path.join(DATA_DIR, "configs")
30
 
31
+ # Create directories if they don't exist
32
+ for directory in [DATA_DIR, UPLOAD_DIR, RESULTS_DIR, CONFIG_DIR]:
33
+ os.makedirs(directory, exist_ok=True)
34
+ # Ensure directories are writable
35
+ os.chmod(directory, 0o777)
36
 
37
+ # Debug information
38
+ print(f"Current user ID: {os.getuid()}")
39
+ print(f"Current effective user ID: {os.geteuid()}")
40
+ print(f"Current working directory: {os.getcwd()}")
41
+ print(f"Data directory permissions: {oct(os.stat(DATA_DIR).st_mode)}")
42
+ print(f"NLTK data directory permissions: {oct(os.stat(nltk_data_dir).st_mode) if os.path.exists(nltk_data_dir) else 'Not found'}")
 
 
 
 
43
 
44
  # Check if VERSA is installed
45
  def check_versa_installation():
 
309
 
310
  # Run VERSA evaluation
311
  try:
312
+
313
+ # Set environment variables for the subprocess
314
+ env = os.environ.copy()
315
+ env["LIBROSA_CACHE_DIR"] = "/tmp/librosa_cache"
316
+ env["LIBROSA_CACHE_LEVEL"] = "0"
317
+
318
+ # Pass through the NLTK_DATA environment variable
319
+ env["NLTK_DATA"] = nltk_data_dir
320
+
321
  process = subprocess.run(
322
  cmd,
323
  check=True,
fix_nltk_permissions.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env python3
"""
Pre-download NLTK data to a writable location and patch
discrete_speech_metrics so its own nltk.download calls use that location.

Meant to be run once as a script (``python fix_nltk_permissions.py``).
Every step is best-effort: a failure is printed and the script carries on,
so it never aborts the caller.
"""

import os
import re
import sys
import importlib.util
import subprocess

# Directory that receives the NLTK packages; also exported as NLTK_DATA.
nltk_data_dir = "/usr/local/share/nltk_data"

# NLTK packages fetched ahead of time.
NLTK_PACKAGES = ("punkt", "stopwords", "wordnet")

# Files inside the discrete_speech_metrics package that are inspected for
# nltk.download calls.
PATCH_TARGETS = ("speechbleu.py", "speechbert.py", "__init__.py")

# Matches nltk.download(...) calls whose argument list contains no nested
# parentheses. Calls with nested parentheses are deliberately left untouched
# rather than risk corrupting them with a naive text rewrite.
_DOWNLOAD_CALL = re.compile(r"nltk\.download\(([^()]*)\)")


def patch_download_calls(source, data_dir):
    """Return *source* with ``download_dir`` added to each nltk.download call.

    The keyword argument is appended AFTER the existing arguments. Inserting
    it first (``nltk.download(download_dir=..., 'punkt')``) would place a
    keyword before a positional argument and make the patched file a
    SyntaxError.

    Calls that already mention ``download_dir`` are returned unchanged, so
    applying the patch twice gives the same text as applying it once.

    Args:
        source: Python source text to rewrite.
        data_dir: Directory path to pass as ``download_dir``.

    Returns:
        The rewritten source text (identical to *source* if nothing matched).
    """
    keyword = f'download_dir="{data_dir}"'

    def _rewrite(match):
        args = match.group(1).strip()
        if "download_dir" in args:
            # Already patched (or the library sets it itself): leave as is.
            return match.group(0)
        # Drop a trailing comma so we do not emit ", ," when appending.
        args = args.rstrip(",").rstrip()
        if not args:
            return f"nltk.download({keyword})"
        return f"nltk.download({args}, {keyword})"

    return _DOWNLOAD_CALL.sub(_rewrite, source)


def _download_packages(nltk_module, packages, data_dir):
    """Download each NLTK package into *data_dir*, reporting the real outcome."""
    for package in packages:
        print(f"Downloading NLTK package: {package}")
        try:
            ok = nltk_module.download(package, download_dir=data_dir, quiet=False)
        except Exception as e:  # best-effort: report and move to the next package
            print(f"Error downloading {package}: {e}")
            continue
        # nltk.download signals failure through its return value, so success
        # must not be assumed just because no exception was raised.
        if ok:
            print(f"Successfully downloaded {package}")
        else:
            print(f"Error downloading {package}: download reported failure")


def _patch_discrete_speech_metrics(data_dir):
    """Rewrite nltk.download calls inside the installed discrete_speech_metrics."""
    try:
        # Find discrete_speech_metrics module
        spec = importlib.util.find_spec("discrete_speech_metrics")
        if spec is None or spec.origin is None:
            print("discrete_speech_metrics module not found")
            return

        module_path = os.path.dirname(spec.origin)
        print(f"Found discrete_speech_metrics at: {module_path}")

        for file_name in PATCH_TARGETS:
            file_path = os.path.join(module_path, file_name)
            if not os.path.exists(file_path):
                continue
            print(f"Patching file: {file_path}")

            with open(file_path, "r", encoding="utf-8") as file:
                content = file.read()

            if "nltk.download" not in content:
                print(f"No nltk.download calls found in {file_path}")
                continue

            patched_content = patch_download_calls(content, data_dir)
            if patched_content == content:
                print(f"nltk.download calls in {file_path} already use download_dir")
                continue

            with open(file_path, "w", encoding="utf-8") as file:
                file.write(patched_content)
            print(f"Successfully patched {file_path}")
    except (ImportError, OSError, ValueError) as e:
        print(f"Error patching discrete_speech_metrics: {e}")


def main():
    """Prepare the NLTK data directory, download data, and patch the library."""
    # Set NLTK data directory to a writable location
    os.makedirs(nltk_data_dir, exist_ok=True)
    os.environ["NLTK_DATA"] = nltk_data_dir

    # Imported here rather than at module level so the patch helper above can
    # be imported and exercised without NLTK installed.
    import nltk

    print(f"Setting NLTK data directory to: {nltk_data_dir}")
    print(f"Directory exists: {os.path.exists(nltk_data_dir)}")
    print(f"Directory permissions: {oct(os.stat(nltk_data_dir).st_mode)}")
    print(f"User ID: {os.getuid()}, Group ID: {os.getgid()}")

    # Make the directory world-writable
    try:
        os.chmod(nltk_data_dir, 0o777)
        print(f"Changed permissions on {nltk_data_dir} to 0o777")
    except OSError as e:
        print(f"Failed to change permissions: {e}")

    # Pre-download necessary NLTK data
    _download_packages(nltk, NLTK_PACKAGES, nltk_data_dir)

    # Try to modify the discrete_speech_metrics module to use our NLTK data dir
    _patch_discrete_speech_metrics(nltk_data_dir)

    print("NLTK setup complete")


if __name__ == "__main__":
    main()