bilalfaye commited on
Commit
7786bd6
·
1 Parent(s): 9fb0090

Add annotations

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. app.py +98 -0
  2. configs.py +40 -0
  3. costum_datasets.py +67 -0
  4. datasets/train2014/COCO_train2014_000000000009.jpg +0 -0
  5. datasets/train2014/COCO_train2014_000000000025.jpg +0 -0
  6. datasets/train2014/COCO_train2014_000000000030.jpg +0 -0
  7. datasets/train2014/COCO_train2014_000000000034.jpg +0 -0
  8. datasets/train2014/COCO_train2014_000000000036.jpg +0 -0
  9. datasets/train2014/COCO_train2014_000000000049.jpg +0 -0
  10. datasets/train2014/COCO_train2014_000000000061.jpg +0 -0
  11. datasets/train2014/COCO_train2014_000000000064.jpg +0 -0
  12. datasets/train2014/COCO_train2014_000000000071.jpg +0 -0
  13. datasets/train2014/COCO_train2014_000000000072.jpg +0 -0
  14. datasets/train2014/COCO_train2014_000000000077.jpg +0 -0
  15. datasets/train2014/COCO_train2014_000000000078.jpg +0 -0
  16. datasets/train2014/COCO_train2014_000000000081.jpg +0 -0
  17. datasets/train2014/COCO_train2014_000000000086.jpg +0 -0
  18. datasets/train2014/COCO_train2014_000000000089.jpg +0 -0
  19. datasets/train2014/COCO_train2014_000000000092.jpg +0 -0
  20. datasets/train2014/COCO_train2014_000000000094.jpg +0 -0
  21. datasets/train2014/COCO_train2014_000000000109.jpg +0 -0
  22. datasets/train2014/COCO_train2014_000000000110.jpg +0 -0
  23. datasets/train2014/COCO_train2014_000000000113.jpg +0 -0
  24. datasets/train2014/COCO_train2014_000000000127.jpg +0 -0
  25. datasets/train2014/COCO_train2014_000000000138.jpg +0 -0
  26. datasets/train2014/COCO_train2014_000000000142.jpg +0 -0
  27. datasets/train2014/COCO_train2014_000000000144.jpg +0 -0
  28. datasets/train2014/COCO_train2014_000000000149.jpg +0 -0
  29. datasets/train2014/COCO_train2014_000000000151.jpg +0 -0
  30. datasets/train2014/COCO_train2014_000000000154.jpg +0 -0
  31. datasets/train2014/COCO_train2014_000000000165.jpg +0 -0
  32. datasets/train2014/COCO_train2014_000000000194.jpg +0 -0
  33. datasets/train2014/COCO_train2014_000000000201.jpg +0 -0
  34. datasets/train2014/COCO_train2014_000000000247.jpg +0 -0
  35. datasets/train2014/COCO_train2014_000000000250.jpg +0 -0
  36. datasets/train2014/COCO_train2014_000000000260.jpg +0 -0
  37. datasets/train2014/COCO_train2014_000000000263.jpg +0 -0
  38. datasets/train2014/COCO_train2014_000000000307.jpg +0 -0
  39. datasets/train2014/COCO_train2014_000000000308.jpg +0 -0
  40. datasets/train2014/COCO_train2014_000000000309.jpg +0 -0
  41. datasets/train2014/COCO_train2014_000000000312.jpg +0 -0
  42. datasets/train2014/COCO_train2014_000000000315.jpg +0 -0
  43. datasets/train2014/COCO_train2014_000000000321.jpg +0 -0
  44. datasets/train2014/COCO_train2014_000000000322.jpg +0 -0
  45. datasets/train2014/COCO_train2014_000000000326.jpg +0 -0
  46. datasets/train2014/COCO_train2014_000000000332.jpg +0 -0
  47. datasets/train2014/COCO_train2014_000000000349.jpg +0 -0
  48. datasets/train2014/COCO_train2014_000000000368.jpg +0 -0
  49. datasets/train2014/COCO_train2014_000000000370.jpg +0 -0
  50. datasets/train2014/COCO_train2014_000000000382.jpg +0 -0
app.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# app.py — Gradio demo: retrieve COCO images from text or audio queries.
from configs import CFG
from costum_datasets import make_pairs


from text_image_audio import OneEncoder
import torch

import gradio as gr

import torchaudio

# Construct (image_path, caption) pairs of text and image
training_pairs = make_pairs(CFG.train_annotation_file, CFG.image_dir, 5)  # 413.915 -> 82.783 images

# Sort by image path so pair order is deterministic
training_pairs = sorted(training_pairs, key=lambda x: x[0])

# Keep only the first caption per unique image.
# Explicit loop instead of the original side-effecting `set.add()` hidden
# inside a comprehension, which is fragile and hard to read.
_seen_images = set()
unique_pairs = []
for _image, _caption in training_pairs:
    if _image not in _seen_images:
        _seen_images.add(_image)
        unique_pairs.append((_image, _caption))
coco_images, _ = zip(*unique_pairs)

# Load pretrained model
model = OneEncoder.from_pretrained("bilalfaye/OneEncoder-text-image-audio")

# Load precomputed coco image features; keep only the first 3000 for the light demo
coco_image_features = torch.load("image_embeddings_best.pt", map_location=CFG.device)
coco_image_features = coco_image_features[:3000]
34
def text_image(query):
    """Retrieve the top-9 COCO images matching a text query.

    NOTE(review): `image_retrieval` with plot=True presumably renders the
    grid and saves it as "img.png", which is the path handed back to
    Gradio — confirm against the OneEncoder implementation.
    """
    retrieval_kwargs = dict(
        image_paths=coco_images,
        image_embeddings=coco_image_features,
        n=9,
        plot=True,
        temperature=0.0,
    )
    model.text_image_encoder.image_retrieval(query, **retrieval_kwargs)
    return "img.png"
43
+
44
def audio_image(query):
    """Retrieve the top-9 COCO images matching an audio query.

    `query` is a filepath supplied by the Gradio Audio component. The clip
    is down-mixed to mono, resampled to 16 kHz, encoded, and used to rank
    the precomputed COCO image embeddings.
    """
    # Load the audio with torchaudio (returns tensor and sample rate)
    waveform, sample_rate = torchaudio.load(query)

    # Down-mix multi-channel (stereo) audio to mono by averaging channels;
    # mono input passes through unchanged.
    if waveform.shape[0] > 1:
        mono_audio = waveform.mean(dim=0, keepdim=True)
    else:
        mono_audio = waveform

    # The audio encoder expects 16 kHz input; resample only when needed.
    if sample_rate != 16000:
        resample = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        mono_audio = resample(mono_audio)

    # Drop the channel dimension and hand a numpy array to the pipeline.
    audio_encoding = model.process_audio([mono_audio.squeeze(0).numpy()])

    # NOTE(review): image_retrieval with plot=True presumably writes the
    # plotted grid to "img.png", the path returned to Gradio — confirm.
    model.image_retrieval(
        audio_encoding,
        image_paths=coco_images,
        image_embeddings=coco_image_features,
        n=9,
        plot=True,
        temperature=0.0,
        display_audio=False,
    )

    return "img.png"
75
+
76
+
77
# Two-tab Gradio UI: one tab per query modality.
# Both tabs share the same demo description.
demo_description = (
    "Implementation of OneEncoder using one layer on UP for light demo, "
    "Only coco train dataset is used in this example (3000 images)."
)

text_tab = gr.Interface(
    fn=text_image,
    inputs=gr.Textbox(label="Text Query"),
    outputs="image",
    title="Retrieve images using text as query",
    description=demo_description,
)

audio_tab = gr.Interface(
    fn=audio_image,
    inputs=gr.Audio(sources=["upload", "microphone"], type="filepath", label="Provide Audio Query"),
    outputs="image",
    title="Retrieve images using audio as query",
    description=demo_description,
)

iface = gr.TabbedInterface(
    [text_tab, audio_tab],
    tab_names=["Text - Image", "Audio - Image"],
)

iface.launch(debug=True, share=True)
configs.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import os
3
+
4
+ ################################################### PARAMETERS #########################################################
5
+ ################################################# PARAMETERS ###########################################################
6
+
7
class CFG:
    """Static configuration: hyper-parameters, device, and dataset paths."""

    # --- text / batching ---
    max_length = 128                 # max token length for text inputs
    batch_size = 32
    num_workers = 4

    # --- model architecture ---
    projection_dim = 768             # shared embedding dimension
    dropout_rate = 0.1
    num_head = 4
    num_layers = 1
    token_size = 1

    # --- per-component learning rates ---
    image_encoder_lr = 1e-4
    radio_encoder_lr = 1e-5
    video_encoder_lr = 1e-4
    text_encoder_lr = 1e-5
    audio_encoder_lr = 1e-5
    modality_token_encoder_lr = 1e-3
    universal_projection_lr = 1e-3
    lr = 1e-3

    # --- optimisation / scheduling ---
    weight_decay = 1e-3
    patience = 10                    # LR-scheduler patience (epochs)
    factor = 0.8                     # LR reduction factor
    epochs = 100

    # --- input sizes / runtime ---
    image_size = 224
    sample_rate = 16000              # expected audio sample rate (Hz)
    device = "cpu"

    # --- dataset locations (COCO 2014) ---
    data_directory = "datasets"
    train_annotation_file = os.path.join(data_directory, "annotations", "captions_train2014.json")
    val_annotation_file = os.path.join(data_directory, "annotations", "captions_val2014.json")
    image_dir = os.path.join(data_directory, "train2014")
    image_dir_val = os.path.join(data_directory, "val2014")

    # --- pretrained backbone identifiers ---
    bert_name = "bert-base-uncased"
    vit_name = "vit_base_patch16_224"
    audio_name = "facebook/wav2vec2-base-960h"
    radio_name = "microsoft/rad-dino"
    video_name = "MCG-NJU/videomae-base"
costum_datasets.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from configs import CFG
2
+ import os
3
+ import requests
4
+ import zipfile
5
+ from pycocotools.coco import COCO
6
+ import torch
7
+ import cv2
8
+ import albumentations as A
9
+ import soundfile as sf
10
+
11
+
12
+
13
# Load Coco dataset
def _download_and_extract(url, zip_path, extract_dir):
    """Stream *url* to *zip_path*, extract the archive into *extract_dir*, remove the zip.

    Creates the directory holding *zip_path* if needed — the original code
    crashed with FileNotFoundError when the target directory did not exist.
    """
    os.makedirs(os.path.dirname(zip_path), exist_ok=True)
    response = requests.get(url, stream=True)
    # Fail loudly on a bad download instead of unzipping an error page.
    response.raise_for_status()
    # write chunks into the zip file; 8192 = 8KB chunks (block or piece of data)
    with open(zip_path, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    # unzip file
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extract_dir)  # Extract all contents to the specified directory
    os.remove(zip_path)


def download_dataset(data_dir="../datasets"):
    """Download the COCO 2014 annotations, train images and val images.

    Parameters
    ----------
    data_dir : str
        Root directory that will receive the ``annotations``, ``train2014``
        and ``val2014`` sub-directories (created on demand).
    """
    annotations_dir = os.path.join(data_dir, "annotations")
    train_images_dir = os.path.join(data_dir, "train2014")
    val_images_dir = os.path.join(data_dir, "val2014")

    # Annotations (captions for train + val 2014)
    _download_and_extract(
        "http://images.cocodataset.org/annotations/annotations_trainval2014.zip",
        os.path.join(annotations_dir, "annotations.zip"),
        data_dir,
    )

    # Train images
    _download_and_extract(
        "http://images.cocodataset.org/zips/train2014.zip",
        os.path.join(train_images_dir, "train2014.zip"),
        data_dir,
    )

    # Val images
    _download_and_extract(
        "http://images.cocodataset.org/zips/val2014.zip",
        os.path.join(val_images_dir, "val2014.zip"),
        data_dir,
    )
59
+
60
+
61
+
62
def make_pairs(annotation_json_files, image_dir, max_captions=3):
    """Build (image_path, caption) pairs for the retrieval demo.

    Parameters
    ----------
    annotation_json_files : str
        Path to a COCO captions JSON file. Currently unused: real captions
        are not loaded and a placeholder caption is emitted for every image.
    image_dir : str
        Directory containing the images to pair.
    max_captions : int
        Kept for interface compatibility; unused while captions are
        placeholders.

    Returns
    -------
    list[tuple[str, str]]
        One ``(full_image_path, "an image")`` pair per file in *image_dir*.

    Fix: the original listed *annotation_json_files* — a JSON *file* path
    with the arguments app.py passes — which raises NotADirectoryError,
    while ignoring *image_dir* entirely. Listing *image_dir* is what the
    caller intends.
    """
    images = os.listdir(image_dir)
    return [(os.path.join(image_dir, image), "an image") for image in images]
67
+
datasets/train2014/COCO_train2014_000000000009.jpg ADDED
datasets/train2014/COCO_train2014_000000000025.jpg ADDED
datasets/train2014/COCO_train2014_000000000030.jpg ADDED
datasets/train2014/COCO_train2014_000000000034.jpg ADDED
datasets/train2014/COCO_train2014_000000000036.jpg ADDED
datasets/train2014/COCO_train2014_000000000049.jpg ADDED
datasets/train2014/COCO_train2014_000000000061.jpg ADDED
datasets/train2014/COCO_train2014_000000000064.jpg ADDED
datasets/train2014/COCO_train2014_000000000071.jpg ADDED
datasets/train2014/COCO_train2014_000000000072.jpg ADDED
datasets/train2014/COCO_train2014_000000000077.jpg ADDED
datasets/train2014/COCO_train2014_000000000078.jpg ADDED
datasets/train2014/COCO_train2014_000000000081.jpg ADDED
datasets/train2014/COCO_train2014_000000000086.jpg ADDED
datasets/train2014/COCO_train2014_000000000089.jpg ADDED
datasets/train2014/COCO_train2014_000000000092.jpg ADDED
datasets/train2014/COCO_train2014_000000000094.jpg ADDED
datasets/train2014/COCO_train2014_000000000109.jpg ADDED
datasets/train2014/COCO_train2014_000000000110.jpg ADDED
datasets/train2014/COCO_train2014_000000000113.jpg ADDED
datasets/train2014/COCO_train2014_000000000127.jpg ADDED
datasets/train2014/COCO_train2014_000000000138.jpg ADDED
datasets/train2014/COCO_train2014_000000000142.jpg ADDED
datasets/train2014/COCO_train2014_000000000144.jpg ADDED
datasets/train2014/COCO_train2014_000000000149.jpg ADDED
datasets/train2014/COCO_train2014_000000000151.jpg ADDED
datasets/train2014/COCO_train2014_000000000154.jpg ADDED
datasets/train2014/COCO_train2014_000000000165.jpg ADDED
datasets/train2014/COCO_train2014_000000000194.jpg ADDED
datasets/train2014/COCO_train2014_000000000201.jpg ADDED
datasets/train2014/COCO_train2014_000000000247.jpg ADDED
datasets/train2014/COCO_train2014_000000000250.jpg ADDED
datasets/train2014/COCO_train2014_000000000260.jpg ADDED
datasets/train2014/COCO_train2014_000000000263.jpg ADDED
datasets/train2014/COCO_train2014_000000000307.jpg ADDED
datasets/train2014/COCO_train2014_000000000308.jpg ADDED
datasets/train2014/COCO_train2014_000000000309.jpg ADDED
datasets/train2014/COCO_train2014_000000000312.jpg ADDED
datasets/train2014/COCO_train2014_000000000315.jpg ADDED
datasets/train2014/COCO_train2014_000000000321.jpg ADDED
datasets/train2014/COCO_train2014_000000000322.jpg ADDED
datasets/train2014/COCO_train2014_000000000326.jpg ADDED
datasets/train2014/COCO_train2014_000000000332.jpg ADDED
datasets/train2014/COCO_train2014_000000000349.jpg ADDED
datasets/train2014/COCO_train2014_000000000368.jpg ADDED
datasets/train2014/COCO_train2014_000000000370.jpg ADDED
datasets/train2014/COCO_train2014_000000000382.jpg ADDED