Please provide a working Python script to run this model

#8
by JLouisBiz - opened

Please provide a working Python script to run this model as expected.

I get this problem:

python rcd-test.py 
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
[2025-04-21 00:44:55,792] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
No ROCm runtime is found, using ROCM_HOME='/usr'
!!!!!!!!!!!!megablocks not available, using torch.matmul instead
Traceback (most recent call last):
  File "/mnt/data/LLM/nomic-ai/nomic-embed-vision-v1.5/rcd-test.py", line 31, in <module>
    text_tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, trust_remote_code=True)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/data1/protected/venv/lib/python3.11/site-packages/transformers/models/auto/tokenization_auto.py", line 1038, in from_pretrained
    raise ValueError(
ValueError: Unrecognized configuration class <class 'transformers_modules.nomic-ai.nomic-bert-2048.e5042dce39060cc34bc223455f25cf1d26db4655.configuration_hf_nomic_bert.NomicBertConfig'> to build an AutoTokenizer.
Model type should be one of AlbertConfig, AlignConfig, AriaConfig, AyaVisionConfig, BarkConfig, BartConfig, BertConfig, BertGenerationConfig, BigBirdConfig, BigBirdPegasusConfig, BioGptConfig, BlenderbotConfig, BlenderbotSmallConfig, BlipConfig, Blip2Config, BloomConfig, BridgeTowerConfig, BrosConfig, CamembertConfig, CanineConfig, ChameleonConfig, ChineseCLIPConfig, ClapConfig, CLIPConfig, CLIPSegConfig, ClvpConfig, LlamaConfig, CodeGenConfig, CohereConfig, Cohere2Config, ColPaliConfig, ConvBertConfig, CpmAntConfig, CTRLConfig, Data2VecAudioConfig, Data2VecTextConfig, DbrxConfig, DebertaConfig, DebertaV2Config, DeepseekV3Config, DiffLlamaConfig, DistilBertConfig, DPRConfig, ElectraConfig, Emu3Config, ErnieConfig, ErnieMConfig, EsmConfig, FalconConfig, FalconMambaConfig, FastSpeech2ConformerConfig, FlaubertConfig, FNetConfig, FSMTConfig, FunnelConfig, GemmaConfig, Gemma2Config, Gemma3Config, Gemma3TextConfig, GitConfig, GlmConfig, Glm4Config, GPT2Config, GPT2Config, GPTBigCodeConfig, GPTNeoConfig, GPTNeoXConfig, GPTNeoXJapaneseConfig, GPTJConfig, GPTSanJapaneseConfig, GroundingDinoConfig, GroupViTConfig, HeliumConfig, HubertConfig, IBertConfig, IdeficsConfig, Idefics2Config, Idefics3Config, InstructBlipConfig, InstructBlipVideoConfig, JambaConfig, JetMoeConfig, JukeboxConfig, Kosmos2Config, LayoutLMConfig, LayoutLMv2Config, LayoutLMv3Config, LEDConfig, LiltConfig, LlamaConfig, Llama4Config, Llama4TextConfig, LlavaConfig, LlavaNextConfig, LlavaNextVideoConfig, LlavaOnevisionConfig, LongformerConfig, LongT5Config, LukeConfig, LxmertConfig, M2M100Config, MambaConfig, Mamba2Config, MarianConfig, MBartConfig, MegaConfig, MegatronBertConfig, MgpstrConfig, MistralConfig, MixtralConfig, MllamaConfig, MobileBertConfig, ModernBertConfig, MoonshineConfig, MoshiConfig, MPNetConfig, MptConfig, MraConfig, MT5Config, MusicgenConfig, MusicgenMelodyConfig, MvpConfig, NemotronConfig, NezhaConfig, NllbMoeConfig, NystromformerConfig, OlmoConfig, Olmo2Config, OlmoeConfig, OmDetTurboConfig, OneFormerConfig, OpenAIGPTConfig, OPTConfig, Owlv2Config, OwlViTConfig, PaliGemmaConfig, PegasusConfig, PegasusXConfig, PerceiverConfig, PersimmonConfig, PhiConfig, Phi3Config, PhimoeConfig, Pix2StructConfig, PixtralVisionConfig, PLBartConfig, ProphetNetConfig, QDQBertConfig, Qwen2Config, Qwen2_5_VLConfig, Qwen2AudioConfig, Qwen2MoeConfig, Qwen2VLConfig, Qwen3Config, Qwen3MoeConfig, RagConfig, RealmConfig, RecurrentGemmaConfig, ReformerConfig, RemBertConfig, RetriBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, RwkvConfig, SeamlessM4TConfig, SeamlessM4Tv2Config, ShieldGemma2Config, SiglipConfig, Siglip2Config, Speech2TextConfig, Speech2Text2Config, SpeechT5Config, SplinterConfig, SqueezeBertConfig, StableLmConfig, Starcoder2Config, SwitchTransformersConfig, T5Config, TapasConfig, TransfoXLConfig, TvpConfig, UdopConfig, UMT5Config, VideoLlavaConfig, ViltConfig, VipLlavaConfig, VisualBertConfig, VitsConfig, Wav2Vec2Config, Wav2Vec2BertConfig, Wav2Vec2ConformerConfig, WhisperConfig, XCLIPConfig, XGLMConfig, XLMConfig, XLMProphetNetConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, XmodConfig, YosoConfig, ZambaConfig, Zamba2Config.
(venv) lco@rtx:/mnt/data/LLM/nomic-ai/nomic-embed-vision-v1.5$ 

with the following script:

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel, AutoImageProcessor
from PIL import Image
import requests

# Local model paths
MODEL_DIR = "/mnt/data/LLM/nomic-ai/nomic-embed-vision-v1.5"

# Initialize models from local directory
processor = AutoImageProcessor.from_pretrained(MODEL_DIR)
vision_model = AutoModel.from_pretrained(MODEL_DIR, trust_remote_code=True)

# Process image
url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(image, return_tensors="pt")

# Get image embeddings
with torch.no_grad():
    img_emb = vision_model(**inputs).last_hidden_state
img_embeddings = F.normalize(img_emb[:, 0], p=2, dim=1)

# Text processing function
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# Initialize text model from local directory
text_tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, trust_remote_code=True)
text_model = AutoModel.from_pretrained(MODEL_DIR, trust_remote_code=True)
text_model.eval()

# Process text
sentences = ['search_query: What are cute animals to cuddle with?', 'search_query: What do cats look like?']
encoded_input = text_tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

with torch.no_grad():
    text_output = text_model(**encoded_input)

text_embeddings = mean_pooling(text_output, encoded_input['attention_mask'])
text_embeddings = F.layer_norm(text_embeddings, normalized_shape=(text_embeddings.shape[1],))
text_embeddings = F.normalize(text_embeddings, p=2, dim=1)

# Calculate similarity
print("Similarity scores:")
print(torch.matmul(img_embeddings, text_embeddings.T))
Nomic AI org

You also need to download nomic-embed-text-v1.5, as shown in the script in the README:

tokenizer = AutoTokenizer.from_pretrained('nomic-ai/nomic-embed-text-v1.5')
text_model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1.5', trust_remote_code=True)

whereas you're trying to load a tokenizer (which doesn't exist) from the vision repository; this repository only contains the vision model.
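
To make the original script work, the text side just needs to point at the text repository instead of the vision repository. A minimal sketch of that fix, assuming nomic-embed-text-v1.5 can be downloaded from the Hub or is already saved locally:

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)

# Tokenizer and text model come from the text repo, not the vision repo
tokenizer = AutoTokenizer.from_pretrained('nomic-ai/nomic-embed-text-v1.5')
text_model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1.5', trust_remote_code=True)
text_model.eval()

sentences = ['search_query: What do cats look like?']
encoded = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
    output = text_model(**encoded)

# Mean-pool, layer-norm, and L2-normalize, as in the script above
text_embeddings = mean_pooling(output, encoded['attention_mask'])
text_embeddings = F.layer_norm(text_embeddings, normalized_shape=(text_embeddings.shape[1],))
text_embeddings = F.normalize(text_embeddings, p=2, dim=1)
# These embeddings can then be compared to the vision embeddings with a dot product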

zpn changed discussion status to closed

Thank you so much for jumping in and trying to help. I have the script working well now, and it is fantastic work, very useful. Especially useful is that it shares the same vector space as the text model from the same company, which means that once I have embeddings for images I can search for images using text. It works very well, better than tagging.

https://www.youtube.com/watch?v=AN-iZblyZNE

Because it shares the same vector space, I can search image embeddings with text queries!
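
As a hypothetical sketch of that kind of search (image_embeddings is assumed to be an (N, D) L2-normalized tensor produced by the vision model, image_paths the matching list of file names, and text_embedding a (1, D) query embedding from nomic-embed-text-v1.5):

import torch

def search_images(text_embedding, image_embeddings, image_paths, top_k=5):
    # Both sides are unit-normalized, so the dot product is the cosine similarity
    scores = image_embeddings @ text_embedding.squeeze(0)
    best = torch.topk(scores, k=min(top_k, len(image_paths)))
    return [(image_paths[i], float(s)) for s, i in zip(best.values, best.indices)]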

I find your website informative, and the way you have been bringing embeddings as knowledge to the public helped me understand them better. I wish you good success in business, and I'm now in love with these models. They are always running on my computer. In fact, they can also run on the CPU; they don't necessarily need a GPU, and they are excellent. I can't live without them. This is now part of my life.

from fastapi import FastAPI, UploadFile, File, HTTPException, Form
from fastapi.responses import JSONResponse
from contextlib import asynccontextmanager
import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor
import io
import numpy as np
import os
import requests

# Configuration
MODEL_PATH = "/mnt/data/LLM/nomic-ai/nomic-embed-vision-v1.5"
HOST = "192.168.1.68"
PORT = 9998

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

@asynccontextmanager
async def lifespan(app: FastAPI):
    if os.path.exists(MODEL_PATH):
        print("πŸ“¦ Loading model from local directory")
        app.state.processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
        app.state.model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).to(device)
    else:
        print("🌐 Downloading model from Hugging Face")
        app.state.processor = AutoProcessor.from_pretrained("nomic-ai/nomic-embed-vision-v1.5", trust_remote_code=True)
        app.state.model = AutoModel.from_pretrained("nomic-ai/nomic-embed-vision-v1.5", trust_remote_code=True).to(device)
        os.makedirs(MODEL_PATH, exist_ok=True)
        app.state.model.save_pretrained(MODEL_PATH)
        app.state.processor.save_pretrained(MODEL_PATH)
    
    app.state.model.eval()
    yield
    del app.state.model
    del app.state.processor

app = FastAPI(lifespan=lifespan)

async def process_image(image_source):
    try:
        if isinstance(image_source, str):  # URL case
            response = requests.get(image_source, stream=True)
            response.raise_for_status()
            image = Image.open(io.BytesIO(response.content)).convert("RGB")
        else:  # UploadFile case
            contents = await image_source.read()
            image = Image.open(io.BytesIO(contents)).convert("RGB")
        
        inputs = app.state.processor(images=image, return_tensors="pt").to(device)
        
        with torch.no_grad():
            outputs = app.state.model(**inputs)
            if hasattr(outputs, 'image_embeds'):
                embeddings = outputs.image_embeds[0]
            elif hasattr(outputs, 'last_hidden_state'):
                embeddings = outputs.last_hidden_state.mean(dim=1)[0]
            else:
                embeddings = outputs[0][:, 0][0]
        
        embedding = embeddings.cpu().float().numpy()
        embedding = embedding / np.linalg.norm(embedding)
        return embedding
        
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Image processing failed: {str(e)}")



@app.post("/embed/image")
async def embed_image(
    file: UploadFile = File(None),
    image_url: str = Form(None)
):
    if not file and not image_url:
        raise HTTPException(status_code=400, detail="Provide either file or image_url")
    if file and image_url:
        raise HTTPException(status_code=400, detail="Provide only one of file or image_url")
    
    try:
        embedding = await process_image(file if file else image_url)
        return JSONResponse({
            "embedding": embedding.tolist(),
            "dimension": len(embedding),
            "model": "nomic-embed-vision-v1.5",
            "device": str(device),
            "source": "file" if file else "url"
        })
    except HTTPException as e:
        raise e
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host=HOST, port=PORT)
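
For reference, calling the server above from a client could look like the following sketch (host and port are taken from the configuration at the top; cat.jpg is a placeholder file name):

import requests

# Upload a local file
resp = requests.post(
    "http://192.168.1.68:9998/embed/image",
    files={"file": open("cat.jpg", "rb")},
)
resp.raise_for_status()
data = resp.json()
print(data["dimension"], data["device"])

# Or pass an image URL as form data instead of uploading a file
resp = requests.post(
    "http://192.168.1.68:9998/embed/image",
    data={"image_url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
)
print(len(resp.json()["embedding"]))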

# # 2025-04-21 works well
# from fastapi import FastAPI, UploadFile, File, HTTPException, Form
# from fastapi.responses import JSONResponse
# from contextlib import asynccontextmanager
# import torch
# from PIL import Image
# from transformers import AutoModel, AutoProcessor
# import io
# import numpy as np
# import os

# # YOUR EXISTING CONFIGURATION (UNCHANGED)
# MODEL_PATH = "/mnt/data/LLM/nomic-ai/nomic-embed-vision-v1.5"
# HOST = "192.168.1.68"
# PORT = 9998

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# @asynccontextmanager
# async def lifespan(app: FastAPI):
#     # YOUR ORIGINAL MODEL LOADING (100% UNCHANGED)
#     if os.path.exists(MODEL_PATH):
#         print("πŸ“¦ Loading model from local directory")
#         app.state.processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
#         app.state.model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).to(device)
#     else:
#         print("🌐 Downloading model from Hugging Face")
#         app.state.processor = AutoProcessor.from_pretrained("nomic-ai/nomic-embed-vision-v1.5", trust_remote_code=True)
#         app.state.model = AutoModel.from_pretrained("nomic-ai/nomic-embed-vision-v1.5", trust_remote_code=True).to(device)
#         os.makedirs(MODEL_PATH, exist_ok=True)
#         app.state.model.save_pretrained(MODEL_PATH)
#         app.state.processor.save_pretrained(MODEL_PATH)
    
#     app.state.model.eval()
#     yield
#     # Cleanup
#     del app.state.model
#     del app.state.processor

# app = FastAPI(lifespan=lifespan)

# # YOUR ORIGINAL IMAGE PROCESSING (UNCHANGED)
# async def process_image(file: UploadFile):
#     try:
#         contents = await file.read()
#         image = Image.open(io.BytesIO(contents)).convert("RGB")
#         inputs = app.state.processor(images=image, return_tensors="pt").to(device)
        
#         with torch.no_grad():
#             outputs = app.state.model(**inputs)
#             if hasattr(outputs, 'image_embeds'):
#                 embeddings = outputs.image_embeds[0]
#             elif hasattr(outputs, 'last_hidden_state'):
#                 embeddings = outputs.last_hidden_state.mean(dim=1)[0]
#             else:
#                 embeddings = outputs[0][:, 0][0]
        
#         embedding = embeddings.cpu().float().numpy()
#         embedding = embedding / np.linalg.norm(embedding)
#         return embedding
        
#     except Exception as e:
#         raise HTTPException(status_code=400, detail=f"Image processing failed: {str(e)}")

# # YOUR ORIGINAL IMAGE ENDPOINT (UNCHANGED)
# @app.post("/embed/image")
# async def embed_image(file: UploadFile = File(...)):
#     embedding = await process_image(file)
#     return JSONResponse({
#         "embedding": embedding.tolist(),
#         "dimension": len(embedding),
#         "model": "nomic-embed-vision-v1.5",
#         "device": str(device)
#     })

# # NEW: TEXT PROCESSING USING SAME MODEL (MINIMAL ADDITION)
# @app.post("/embed/text")
# async def embed_text(text: str = Form(...)):
#     """Simple text embedding using the vision model's text processor"""
#     try:
#         # Uses the same processor that came with your vision model
#         inputs = app.state.processor(text=[text], return_tensors="pt", padding=True, truncation=True).to(device)
        
#         with torch.no_grad():
#             outputs = app.state.model(**inputs)
#             if hasattr(outputs, 'text_embeds'):
#                 embeddings = outputs.text_embeds[0]
#             else:
#                 # Fallback to using image features for text
#                 embeddings = outputs.last_hidden_state.mean(dim=1)[0]
        
#         embedding = embeddings.cpu().float().numpy()
#         embedding = embedding / np.linalg.norm(embedding)
#         return JSONResponse({
#             "embedding": embedding.tolist(),
#             "dimension": len(embedding),
#             "model": "nomic-embed-vision-v1.5",
#             "note": "Text processed using vision model's capabilities"
#         })
#     except Exception as e:
#         raise HTTPException(status_code=400, detail=str(e))

# if __name__ == "__main__":
#     import uvicorn
#     uvicorn.run(app, host=HOST, port=PORT)
# --------------------------------------------
# # 2025-04-21 works well on CUDA
# from fastapi import FastAPI, UploadFile, File, HTTPException
# from fastapi.responses import JSONResponse
# from contextlib import asynccontextmanager
# import torch
# from PIL import Image
# from transformers import AutoModel, AutoProcessor
# import io
# import numpy as np
# import os

# # Configuration
# MODEL_PATH = "/mnt/data/LLM/nomic-ai/nomic-embed-vision-v1.5"
# HOST = "192.168.1.68"
# PORT = 9998

# # [NEW] Device detection
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")

# # Model loading with lifespan management
# @asynccontextmanager
# async def lifespan(app: FastAPI):
#     # Load model and processor
#     if os.path.exists(MODEL_PATH):
#         print("πŸ“¦ Loading model from local directory")
#         app.state.processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
#         app.state.model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).to(device)  # [MODIFIED]
#     else:
#         print("🌐 Downloading model from Hugging Face")
#         app.state.processor = AutoProcessor.from_pretrained("nomic-ai/nomic-embed-vision-v1.5", trust_remote_code=True)
#         app.state.model = AutoModel.from_pretrained("nomic-ai/nomic-embed-vision-v1.5", trust_remote_code=True).to(device)  # [MODIFIED]
#         os.makedirs(MODEL_PATH, exist_ok=True)
#         app.state.model.save_pretrained(MODEL_PATH)
#         app.state.processor.save_pretrained(MODEL_PATH)
    
#     app.state.model.eval()
#     yield
#     # Cleanup
#     del app.state.model
#     del app.state.processor

# app = FastAPI(lifespan=lifespan)

# async def process_image(file: UploadFile):
#     try:
#         # Read and preprocess image
#         contents = await file.read()
#         image = Image.open(io.BytesIO(contents)).convert("RGB")
#         inputs = app.state.processor(images=image, return_tensors="pt").to(device)  # [MODIFIED]
        
#         # Get embeddings
#         with torch.no_grad():
#             outputs = app.state.model(**inputs)
            
#             if hasattr(outputs, 'image_embeds'):
#                 embeddings = outputs.image_embeds[0]
#             elif hasattr(outputs, 'last_hidden_state'):
#                 embeddings = outputs.last_hidden_state.mean(dim=1)[0]
#             else:
#                 embeddings = outputs[0][:, 0][0]
        
#         # Convert to numpy and normalize
#         embedding = embeddings.cpu().float().numpy()  # [MODIFIED] (added .cpu())
#         embedding = embedding / np.linalg.norm(embedding)
        
#         return embedding
        
#     except Exception as e:
#         raise HTTPException(status_code=400, detail=f"Image processing failed: {str(e)}")

# @app.post("/embed")
# async def embed_image(file: UploadFile = File(...)):
#     """Endpoint for single image embedding"""
#     embedding = await process_image(file)
#     return JSONResponse({
#         "embedding": embedding.tolist(),
#         "dimension": len(embedding),
#         "model": "nomic-embed-vision-v1.5",
#         "device": str(device)  # [NEW] Show which device was used
#     })

# if __name__ == "__main__":
#     import uvicorn
#     uvicorn.run(app, host=HOST, port=PORT)
# ----------------------------------------------------
# runs WELL ON CPU: 2025-04-21
# from fastapi import FastAPI, UploadFile, File, HTTPException
# from fastapi.responses import JSONResponse
# from contextlib import asynccontextmanager
# import torch
# from PIL import Image
# from transformers import AutoModel, AutoProcessor
# import io
# import numpy as np
# import os

# # Configuration
# MODEL_PATH = "/mnt/data/LLM/nomic-ai/nomic-embed-vision-v1.5"
# HOST = "192.168.1.68"
# PORT = 9998

# # Model loading with lifespan management
# @asynccontextmanager
# async def lifespan(app: FastAPI):
#     # Load model and processor
#     if os.path.exists(MODEL_PATH):
#         print("πŸ“¦ Loading model from local directory")
#         app.state.processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
#         app.state.model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True)
#     else:
#         print("🌐 Downloading model from Hugging Face")
#         app.state.processor = AutoProcessor.from_pretrained("nomic-ai/nomic-embed-vision-v1.5", trust_remote_code=True)
#         app.state.model = AutoModel.from_pretrained("nomic-ai/nomic-embed-vision-v1.5", trust_remote_code=True)
#         os.makedirs(MODEL_PATH, exist_ok=True)
#         app.state.model.save_pretrained(MODEL_PATH)
#         app.state.processor.save_pretrained(MODEL_PATH)
    
#     app.state.model.eval()
#     yield
#     # Cleanup
#     del app.state.model
#     del app.state.processor

# app = FastAPI(lifespan=lifespan)

# async def process_image(file: UploadFile):
#     try:
#         # Read and preprocess image
#         contents = await file.read()
#         image = Image.open(io.BytesIO(contents)).convert("RGB")
#         inputs = app.state.processor(images=image, return_tensors="pt")
        
#         # Get embeddings
#         with torch.no_grad():
#             outputs = app.state.model(**inputs)
            
#             if hasattr(outputs, 'image_embeds'):
#                 embeddings = outputs.image_embeds[0]
#             elif hasattr(outputs, 'last_hidden_state'):
#                 embeddings = outputs.last_hidden_state.mean(dim=1)[0]
#             else:
#                 embeddings = outputs[0][:, 0][0]
        
#         # Convert to numpy and normalize
#         embedding = embeddings.float().numpy()
#         embedding = embedding / np.linalg.norm(embedding)
        
#         return embedding
        
#     except Exception as e:
#         raise HTTPException(status_code=400, detail=f"Image processing failed: {str(e)}")

# @app.post("/embed")
# async def embed_image(file: UploadFile = File(...)):
#     """Endpoint for single image embedding"""
#     embedding = await process_image(file)
#     return JSONResponse({
#         "embedding": embedding.tolist(),
#         "dimension": len(embedding),
#         "model": "nomic-embed-vision-v1.5"
#     })

# @app.get("/model-info")
# async def model_info():
#     """Return model metadata"""
#     dummy_input = torch.rand(1, 3, 224, 224)
#     with torch.no_grad():
#         output = app.state.model(dummy_input)
#         dim = output.last_hidden_state.shape[-1] if hasattr(output, 'last_hidden_state') else len(output[0][0])
    
#     return {
#         "model": "nomic-embed-vision-v1.5",
#         "embedding_dimension": dim,
#         "normalized": True,
#         "endpoint": f"http://{HOST}:{PORT}/embed"
#     }

# if __name__ == "__main__":
#     import uvicorn
#     uvicorn.run(app, host=HOST, port=PORT)