This view is limited to 50 files because it contains too many changes.
- app.py +98 -0
- configs.py +40 -0
- costum_datasets.py +67 -0
- datasets/train2014/COCO_train2014_000000000009.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000025.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000030.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000034.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000036.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000049.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000061.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000064.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000071.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000072.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000077.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000078.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000081.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000086.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000089.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000092.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000094.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000109.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000110.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000113.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000127.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000138.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000142.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000144.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000149.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000151.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000154.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000165.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000194.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000201.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000247.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000250.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000260.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000263.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000307.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000308.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000309.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000312.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000315.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000321.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000322.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000326.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000332.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000349.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000368.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000370.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000382.jpg +0 -0
app.py
ADDED
@@ -0,0 +1,98 @@
```python
# Demo app: retrieve COCO images from text or audio queries with OneEncoder.
from configs import CFG
from costum_datasets import make_pairs

from text_image_audio import OneEncoder
import torch
import gradio as gr
import torchaudio

# Construct pairs of text and image
training_pairs = make_pairs(CFG.train_annotation_file, CFG.image_dir, 5)  # 413,915 pairs -> 82,783 unique images

# Sort pairs by image path
training_pairs = sorted(training_pairs, key=lambda x: x[0])

coco_images, coco_captions = zip(*training_pairs)

# Keep only the first pair for each unique image
# (set.add returns None, so the second condition only records the image as seen)
unique_images = set()
unique_pairs = [(item[0], item[1]) for item in training_pairs
                if item[0] not in unique_images and not unique_images.add(item[0])]
coco_images, _ = zip(*unique_pairs)

# Load model
model = OneEncoder.from_pretrained("bilalfaye/OneEncoder-text-image-audio")

# Load precomputed COCO image features; keep 3,000 for a light demo
coco_image_features = torch.load("image_embeddings_best.pt", map_location=CFG.device)
coco_image_features = coco_image_features[:3000]


def text_image(query):
    # image_retrieval plots the top-n matches; the figure is saved as
    # img.png, which is returned for Gradio to display
    model.text_image_encoder.image_retrieval(query,
                                             image_paths=coco_images,
                                             image_embeddings=coco_image_features,
                                             n=9,
                                             plot=True,
                                             temperature=0.0)
    return "img.png"


def audio_image(query):
    # Load the audio with torchaudio (returns a waveform tensor and sample rate)
    waveform, sample_rate = torchaudio.load(query)

    # Downmix stereo to mono by averaging the channels
    if waveform.shape[0] > 1:  # stereo (2 channels)
        mono_audio = waveform.mean(dim=0, keepdim=True)
    else:  # already mono
        mono_audio = waveform

    # Resample to 16,000 Hz if not already
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        mono_audio = resampler(mono_audio)

    # Convert to a NumPy array for the audio processor
    mono_audio = mono_audio.squeeze(0).numpy()

    audio_encoding = model.process_audio([mono_audio])

    model.image_retrieval(audio_encoding,
                          image_paths=coco_images,
                          image_embeddings=coco_image_features,
                          n=9,
                          plot=True,
                          temperature=0.0,
                          display_audio=False)

    return "img.png"


# Gradio interface: one tab per query modality
iface = gr.TabbedInterface(
    [
        gr.Interface(
            fn=text_image,
            inputs=gr.Textbox(label="Text Query"),
            outputs="image",
            title="Retrieve images using text as query",
            description="OneEncoder with a single UP (universal projection) layer for a light demo; only 3,000 images from the COCO train set are used."
        ),
        gr.Interface(
            fn=audio_image,
            inputs=gr.Audio(sources=["upload", "microphone"], type="filepath", label="Provide Audio Query"),
            outputs="image",
            title="Retrieve images using audio as query",
            description="OneEncoder with a single UP (universal projection) layer for a light demo; only 3,000 images from the COCO train set are used."
        )
    ],
    tab_names=["Text - Image", "Audio - Image"]
)

iface.launch(debug=True, share=True)
```
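A note on the de-duplication above: it relies on the side effect of `set.add` inside the list comprehension (`add` always returns `None`, so `not unique_images.add(...)` is always true and merely records the path). An equivalent, more explicit loop (a sketch, not part of the Space) would be:

```python
def first_pair_per_image(pairs):
    """Keep the first (image, caption) pair seen for each image path."""
    seen = set()
    unique = []
    for image, caption in pairs:
        if image not in seen:
            seen.add(image)
            unique.append((image, caption))
    return unique
```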
configs.py
ADDED
@@ -0,0 +1,40 @@
```python
import torch
import os

################################################# PARAMETERS ###########################################################

class CFG:
    max_length = 128
    batch_size = 32
    num_workers = 4
    projection_dim = 768
    dropout_rate = 0.1
    num_head = 4
    num_layers = 1
    image_encoder_lr = 1e-4
    radio_encoder_lr = 1e-5
    video_encoder_lr = 1e-4
    text_encoder_lr = 1e-5
    audio_encoder_lr = 1e-5
    modality_token_encoder_lr = 1e-3
    universal_projection_lr = 1e-3
    lr = 1e-3
    weight_decay = 1e-3
    patience = 10
    factor = 0.8
    token_size = 1
    epochs = 100
    image_size = 224
    device = "cpu"
    data_directory = "datasets"
    train_annotation_file = os.path.join(data_directory, "annotations", "captions_train2014.json")
    val_annotation_file = os.path.join(data_directory, "annotations", "captions_val2014.json")
    image_dir = os.path.join(data_directory, "train2014")
    image_dir_val = os.path.join(data_directory, "val2014")
    bert_name = "bert-base-uncased"
    vit_name = "vit_base_patch16_224"
    audio_name = "facebook/wav2vec2-base-960h"
    radio_name = "microsoft/rad-dino"
    video_name = "MCG-NJU/videomae-base"
    sample_rate = 16000
```
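The Space pins `device` to `"cpu"`. If you reuse this config on GPU hardware, a common pattern (a hypothetical tweak, not in the original file) is to select the device dynamically:

```python
import torch

# Hypothetical alternative to the hardcoded "cpu" above
device = "cuda" if torch.cuda.is_available() else "cpu"
```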
costum_datasets.py
ADDED
@@ -0,0 +1,67 @@
```python
from configs import CFG
import os
import requests
import zipfile
# The following imports are unused in this demo file
from pycocotools.coco import COCO
import torch
import cv2
import albumentations as A
import soundfile as sf


# Download the COCO 2014 dataset (captions plus train/val images)
def download_dataset(data_dir="../datasets"):
    # Create caption and image directories
    annotations_dir = os.path.join(data_dir, "annotations")
    images_dir = os.path.join(data_dir, "train2014")
    os.makedirs(annotations_dir, exist_ok=True)
    os.makedirs(images_dir, exist_ok=True)

    # Download annotations (captions)
    zip_file = os.path.join(annotations_dir, "annotations.zip")
    url = "http://images.cocodataset.org/annotations/annotations_trainval2014.zip"
    response = requests.get(url, stream=True)
    # Stream the download to disk in 8 KB chunks
    with open(zip_file, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    # Unzip, then delete the archive
    with zipfile.ZipFile(zip_file, "r") as zip_ref:
        zip_ref.extractall(data_dir)  # extract all contents into data_dir
    os.remove(zip_file)

    # Download train images
    zip_file = os.path.join(images_dir, "train2014.zip")
    url = "http://images.cocodataset.org/zips/train2014.zip"
    response = requests.get(url, stream=True)
    with open(zip_file, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    with zipfile.ZipFile(zip_file, "r") as zip_ref:
        zip_ref.extractall(data_dir)
    os.remove(zip_file)

    # Download val images
    images_dir = os.path.join(data_dir, "val2014")
    os.makedirs(images_dir, exist_ok=True)
    zip_file = os.path.join(images_dir, "val2014.zip")
    url = "http://images.cocodataset.org/zips/val2014.zip"
    response = requests.get(url, stream=True)
    with open(zip_file, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    with zipfile.ZipFile(zip_file, "r") as zip_ref:
        zip_ref.extractall(data_dir)
    os.remove(zip_file)


def make_pairs(annotation_json_files, image_dir, max_captions=3):
    # Simplified stub for the demo: pair every image in image_dir with a
    # placeholder caption; the annotation file and max_captions are unused here
    images = os.listdir(image_dir)
    image_caption = [(os.path.join(image_dir, image), "an image") for image in images]
    return image_caption
```
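`make_pairs` above is a placeholder that never reads the COCO captions, even though `pycocotools` is imported. For reference, a caption-aware version consistent with the `413,915 pairs -> 82,783 unique images` comment in `app.py` might look like the sketch below; `make_pairs_from_annotations` is a hypothetical name, and this is an assumption about the intended behavior, not code from the Space:

```python
import os
from pycocotools.coco import COCO


def make_pairs_from_annotations(annotation_file, image_dir, max_captions=3):
    """Pair each COCO image with up to max_captions of its human captions."""
    coco = COCO(annotation_file)
    pairs = []
    for image_id in coco.getImgIds():
        file_name = coco.loadImgs(image_id)[0]["file_name"]
        ann_ids = coco.getAnnIds(imgIds=image_id)
        captions = [ann["caption"] for ann in coco.loadAnns(ann_ids)]
        for caption in captions[:max_captions]:
            pairs.append((os.path.join(image_dir, file_name), caption))
    return pairs
```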
The remaining 47 changes add the datasets/train2014/*.jpg sample images listed in the file summary above (binary content not shown).