electroglyph committed
Commit b480395 · verified · 1 Parent(s): 732bd9b

Delete benchmark.py

Files changed (1)
  1. benchmark.py +0 -292
benchmark.py DELETED
@@ -1,292 +0,0 @@
- import json
- import onnxruntime as rt
- import transformers
- from qdrant_client import QdrantClient, models
- import queue
- from threading import Thread, Lock
- import time
- from pyatomix import AtomicInt
-
- # adjust these settings as needed
- TOKENIZER_PATH = "."
- ORIG_MODEL_PATH = "model_uint8.onnx"
- ORIG_DATATYPE = models.Datatype.FLOAT32
- ORIG_COLLECTION_NAME = "baseline"
- COMPARE_MODEL_PATH = "snowflake2_m_uint8.onnx"
- COMPARE_DATATYPE = models.Datatype.UINT8
- COMPARE_COLLECTION_NAME = "compare"
- EMBEDDING_DIM = 768  # size of the model output
- STAT_RANGES = [
-     10,
-     20,
-     50,
- ]  # stats will be calculated for each range: top 10, top 20, etc.
- STATS = {}
- STAT_LOCK = Lock()
- BATCH_SIZE = 1000  # this many token/id pairs will be processed at a time
- THREADS = 8  # number of threads to use
- # Qdrant client settings here
- CLIENT_URL = "http://127.0.0.1"
- CLIENT_PORT = 6333
- CLIENT_GRPC_PORT = 6334
- CLIENT_USE_GRPC = True
- FINISHED = AtomicInt(0)
-
-
- def collect_tokens() -> list[str] | None:
-     print("Attempting to grab tokens from tokenizer...")
-     with open(f"{TOKENIZER_PATH}/tokenizer.json", "r") as f:
-         t = f.read()
-     j = json.loads(t)
-     v = j["model"]["vocab"]
-     toks = [x[0] for x in v]
-     print(f"Found {len(toks)} tokens.")
-     return toks
-
-
- def init_worker(q: queue.Queue, model_path: str, collection_name: str):
-     try:
-         session = rt.InferenceSession(model_path, providers=["CPUExecutionProvider"])
-     except Exception as e:
-         print(f"Error loading ONNX model: {e}")
-         return
-     tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_PATH)
-     client = QdrantClient(
-         url=CLIENT_URL,
-         port=CLIENT_PORT,
-         grpc_port=CLIENT_GRPC_PORT,
-         prefer_grpc=CLIENT_USE_GRPC,
-     )
-     global FINISHED
-     while True:
-         try:
-             chunk = q.get(False)
-         except queue.Empty:
-             return
-         batch = []
-         for c in chunk:
-             FINISHED += 1
-             # c[0] == id, c[1] == token; we want each id to always be associated with the same token across models
-             enc = tokenizer(c[1])  # this could've been batched...
-             embeddings = session.run(
-                 None,
-                 {
-                     "input_ids": [enc.input_ids],
-                     "attention_mask": [enc.attention_mask],
-                 },
-             )
-             batch.append(  # [1][0] == sentence_embedding
-                 models.PointStruct(id=c[0], vector={"dense": embeddings[1][0]})
-             )
-         client.batch_update_points(
-             collection_name=collection_name,
-             update_operations=[models.UpsertOperation(upsert=models.PointsList(points=batch))],
-             wait=False,
-         )
-
-
- def init_collection(collection_name: str, model_path: str, datatype: models.Datatype) -> bool:
-     client = QdrantClient(
-         url=CLIENT_URL,
-         port=CLIENT_PORT,
-         grpc_port=CLIENT_GRPC_PORT,
-         prefer_grpc=CLIENT_USE_GRPC,
-     )
-     if client.collection_exists(collection_name):
-         info = client.get_collection(collection_name)
-         print(f"Collection '{collection_name}' already exists, skipping init.")
-         print(f"{info.points_count} points in collection.")
-         return True
-     res = client.create_collection(
-         collection_name=collection_name,
-         vectors_config={
-             "dense": models.VectorParams(
-                 size=EMBEDDING_DIM,
-                 distance=models.Distance.COSINE,
-                 on_disk=False,
-                 datatype=datatype,
-             ),
-         },
-         hnsw_config=models.HnswConfigDiff(m=0),  # no index
-         on_disk_payload=False,
-     )
-     if not res:
-         print("Error creating collection.")
-         return False
-     else:
-         print("Collection created.")
-     toks = collect_tokens()
-     FINISHED.store(0)
-     if toks:
-         ids = [x for x in range(len(toks))]
-         # align Qdrant IDs with the token for later analysis
-         pairs = list(zip(ids, toks))
-         # lists of (Qdrant ID, token)
-         chunks = [pairs[i : i + BATCH_SIZE] for i in range(0, len(pairs), BATCH_SIZE)]
-         q = queue.Queue()
-         for c in chunks:
-             q.put(c)
-         for _ in range(THREADS):
-             t = Thread(target=init_worker, args=[q, model_path, collection_name])
-             t.start()
-         count = 0
-         while FINISHED.load() < len(toks):
-             time.sleep(0.5)
-             count += 1
-             if count == 20:  # update every 10 seconds or so
-                 print(f"approximately {q.qsize() * BATCH_SIZE} items left in queue...")
-                 count = 0
-         print(f"Done with collection init, {len(toks)} tokens upserted.")
-         # enable indexing
-         client.update_collection(collection_name=collection_name, hnsw_config=models.HnswConfigDiff(m=16))
-         return True
-     else:
-         print("Failed to grab tokens from tokenizer.")
-         return False
-
-
- def count_mismatches(list1, list2) -> int:
-     count = 0
-     assert len(list1) == len(list2)
-     for i in range(len(list1)):
-         if list1[i] != list2[i]:
-             count += 1
-     return count
-
-
- def score_results(
-     list1: list,
-     list2: list,
- ):
-     assert len(list1) == len(list2)
-     global STATS
-     for x in STAT_RANGES:
-         with STAT_LOCK:
-             # STATS = {range: {"exact": AtomicInt, ...}}
-             d = STATS.get(x)
-             if d is None:
-                 d = {
-                     "exact": AtomicInt(0),
-                     "off_by_1": AtomicInt(0),
-                     "off_by_2": AtomicInt(0),
-                     "off_by_3": AtomicInt(0),
-                     "off_by_4": AtomicInt(0),
-                     "off_by_5": AtomicInt(0),
-                     "missing": AtomicInt(0),
-                 }
-                 STATS[x] = d
-         for i in range(x):
-             if list1[i] == list2[i]:
-                 d["exact"] += 1
-             else:
-                 if list1[i] in list2:
-                     i2 = list2.index(list1[i])
-                     val = abs(i2 - i)
-                     if val == 1:
-                         d["off_by_1"] += 1
-                     elif val == 2:
-                         d["off_by_2"] += 1
-                     elif val == 3:
-                         d["off_by_3"] += 1
-                     elif val == 4:
-                         d["off_by_4"] += 1
-                     else:
-                         d["off_by_5"] += 1
-                 else:
-                     d["missing"] += 1
-
-
- def main_worker(q: queue.Queue, limit: int):
-     global FINISHED
-     tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_PATH)
-     orig_session = rt.InferenceSession(ORIG_MODEL_PATH, providers=["CPUExecutionProvider"])
-     compare_session = rt.InferenceSession(COMPARE_MODEL_PATH, providers=["CPUExecutionProvider"])
-     client = QdrantClient(
-         url=CLIENT_URL,
-         port=CLIENT_PORT,
-         grpc_port=CLIENT_GRPC_PORT,
-         prefer_grpc=CLIENT_USE_GRPC,
-     )
-     while True:
-         try:
-             chunk = q.get(False)
-         except queue.Empty:
-             return
-         # each c is a token; embed it with both models and compare the ranked query results
-         for c in chunk:
-             enc = tokenizer(c)
-             oe = orig_session.run(
-                 None,
-                 {"input_ids": [enc.input_ids], "attention_mask": [enc.attention_mask]},
-             )
-             ce = compare_session.run(
-                 None,
-                 {"input_ids": [enc.input_ids], "attention_mask": [enc.attention_mask]},
-             )
-             oresult = client.query_points(
-                 collection_name=ORIG_COLLECTION_NAME,
-                 using="dense",
-                 query=oe[1][0],
-                 limit=limit + 5,  # for our scoring metric we want to look slightly past the end
-             )
-             cresult = client.query_points(
-                 collection_name=COMPARE_COLLECTION_NAME,
-                 using="dense",
-                 query=ce[1][0],
-                 limit=limit + 5,
-             )
-             oids = [p.id for p in oresult.points]
-             cids = [p.id for p in cresult.points]
-             score_results(
-                 oids,
-                 cids,
-             )
-             FINISHED += 1
-
-
- def main():
-     if not init_collection(ORIG_COLLECTION_NAME, ORIG_MODEL_PATH, ORIG_DATATYPE):
-         print("Failed to initialize original model values, exiting.")
-         return
-     if not init_collection(COMPARE_COLLECTION_NAME, COMPARE_MODEL_PATH, COMPARE_DATATYPE):
-         print("Failed to initialize secondary model values, exiting.")
-         return
-     toks = collect_tokens()
-     limit = 0
-     for x in STAT_RANGES:
-         if x > limit:
-             limit = x
-     FINISHED.store(0)
-     if toks:
-         chunks = [toks[i : i + BATCH_SIZE] for i in range(0, len(toks), BATCH_SIZE)]
-         q = queue.Queue()
-         for c in chunks:
-             q.put(c)
-         print("Starting analysis.")
-         for _ in range(THREADS):
-             t = Thread(
-                 target=main_worker,
-                 args=[q, limit],
-             )
-             t.start()
-         count = 0
-         while FINISHED.load() < len(toks):
-             time.sleep(0.5)
-             count += 1
-             if count == 20:  # update every 10 seconds or so
-                 print(f"approximately {q.qsize() * BATCH_SIZE} items left in queue...")
-                 count = 0
-         print("Done with analysis.")
-         with STAT_LOCK:
-             for k, v in STATS.items():
-                 print(f"Stats for top {k} query results across entire token range:")
-                 print(f"exact    : {(float(v['exact'].load()) / (len(toks) * k)) * 100:.2f}%")
-                 print(f"off by 1 : {(float(v['off_by_1'].load()) / (len(toks) * k)) * 100:.2f}%")
-                 print(f"off by 2 : {(float(v['off_by_2'].load()) / (len(toks) * k)) * 100:.2f}%")
-                 print(f"off by 3 : {(float(v['off_by_3'].load()) / (len(toks) * k)) * 100:.2f}%")
-                 print(f"off by 4 : {(float(v['off_by_4'].load()) / (len(toks) * k)) * 100:.2f}%")
-                 print(f"off by 5+: {(float(v['off_by_5'].load()) / (len(toks) * k)) * 100:.2f}%")
-                 print(f"missing  : {(float(v['missing'].load()) / (len(toks) * k)) * 100:.2f}%\n")
-
-
- main()