File size: 1,168 Bytes
3133b5e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import os
from typing import List
from urllib.request import urlretrieve
from zipfile import ZipFile

import pandas as pd

from src.utils.logging_utils import get_pylogger

log = get_pylogger(__name__)


def filter_dataframe_and_get_column(
    dataframe: pd.DataFrame, filter_column: str, filter_value: str, select_column: str
) -> List[str]:
    """Return the ``select_column`` values of all rows matching a filter.

    Args:
        dataframe: The frame to select from.
        filter_column: Name of the column that is compared against ``filter_value``.
        filter_value: Value a row must have in ``filter_column`` to be kept.
        select_column: Name of the column whose values are returned.

    Returns:
        The values of ``select_column`` for the kept rows, as a plain list in
        row order (empty if no row matches).
    """
    # Boolean mask marking the rows whose filter column equals the requested value.
    row_matches = dataframe[filter_column] == filter_value
    matching_rows = dataframe[row_matches]
    selected_values = matching_rows[select_column]
    return selected_values.tolist()


def download_and_unzip(
    url: str, target_path: str, force_download: bool = False, remove_tmp_file: bool = False
):
    """Download a zip archive over HTTP(S) and extract it into ``target_path``.

    The archive is stored inside ``target_path`` under the last path component
    of ``url``. If a file with that name already exists it is reused instead of
    being downloaded again, unless ``force_download`` is set.

    Args:
        url: Address of the zip file; must start with ``http://`` or ``https://``
            and must not end with a slash (a file name is derived from it).
        target_path: Directory that receives the downloaded archive and the
            extracted content. It is created if it does not exist.
        force_download: Download the archive even if it is already present.
        remove_tmp_file: Delete the downloaded archive after extraction.

    Raises:
        ValueError: If ``url`` is not a http(s) address or no file name can be
            derived from it.
    """
    # Validate before logging or touching the file system, so a bad url fails fast.
    if not url.startswith(("http://", "https://")):
        raise ValueError(f"url needs to point to a http(s) address, but it is: {url}")
    file_name = os.path.basename(url)
    if not file_name:
        # Without a file name, tmp_file would be the target directory itself.
        raise ValueError(f"can not derive a file name from the url: {url}")

    log.warning(f"download zip file from {url} to {target_path} ...")
    # urlretrieve does not create missing directories for its destination.
    os.makedirs(target_path, exist_ok=True)
    tmp_file = os.path.join(target_path, file_name)
    if os.path.exists(tmp_file) and not force_download:
        log.warning(f"tmp file {tmp_file} already exists, skip downloading {url}")
    else:
        # Download to a side file and rename it only after success. Otherwise an
        # interrupted download would leave a truncated archive that later calls
        # mistake for a complete, cached one.
        partial_file = f"{tmp_file}.part"
        try:
            urlretrieve(url, partial_file)  # nosec
            os.replace(partial_file, tmp_file)
        finally:
            # Only still present if the download or the rename failed.
            if os.path.exists(partial_file):
                os.remove(partial_file)
    with ZipFile(tmp_file, "r") as zfile:
        zfile.extractall(target_path)
    if remove_tmp_file:
        os.remove(tmp_file)