Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 5 additions & 19 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -67,41 +67,33 @@ name = "pruna_internal"
url = "https://prunaai.pythonanywhere.com/simple/"
explicit = true

[[tool.uv.index]]
name = "intel-pytorch-extension"
url = "https://pytorch-extension.intel.com/release-whl/stable/cpu/cn/"
explicit = true

[tool.uv]
index-strategy = "first-index"
exclude-newer = "1 week" # protection against compromised dependencies
# trusted dev wheels that are missing an upload date
exclude-newer-package = { gptqmodel = false, "stable-fast-pruna" = false }

conflicts = [
[{ extra = "awq" }, { extra = "vbench" }],
[{ extra = "vllm" }, { extra = "vbench" }],
[{ extra = "intel" }, { extra = "awq" }],
[{ extra = "gptq" }, { extra = "awq" }],
# intel is incompatible with all stable-fast variants and vllm
[{ extra = "intel" }, { extra = "stable-fast" }, { extra = "stable-fast-extraindex" }],
[{ extra = "intel" }, { extra = "full" }, { extra = "stable-fast-extraindex" }],
[{ extra = "intel" }, { extra = "vllm" }],
[{ extra = "kvpress" }, { extra = "vbench" }],
]

[tool.uv.sources]
gptqmodel = { index = "pruna_internal", marker = "sys_platform != 'darwin' or platform_machine != 'arm64'" }
intel-extension-for-pytorch = { index = "intel-pytorch-extension" }
stable-fast-pruna = { index = "pruna_internal", extra = "stable-fast-extraindex" }

[project]
name = "pruna"
version = "0.3.2"
version = "0.3.3"
description = "Smash your AI models"
authors = [
{name = "Pruna AI", email = "hello@pruna.ai"}
]
license = {file = "LICENSE"}
readme = "README.md"
requires-python = ">=3.10,<3.13"
requires-python = ">=3.10,<3.14"
keywords = ["AI", "machine learning", "model optimization", "pruning"]
classifiers = [
"Development Status :: 4 - Beta",
Expand Down Expand Up @@ -246,12 +238,6 @@ lmharness = [
"lm-eval>=0.4.0"
]

# Intel extension is tightly coupled with the torch version
intel = [
"intel-extension-for-pytorch>=2.7.0",
"torch>=2.7.0,<2.9.0",
"torchvision>=0.22.0,<0.24.0",
]
kvpress = [
"kvpress>=0.5.2",
]
Expand Down
31 changes: 12 additions & 19 deletions src/pruna/evaluation/benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,19 +66,19 @@ class BenchmarkRegistry:
paper (see reference URL). All entries verified from paper evaluation
sections (ar5iv/HTML or PDF) as of verification pass:

- Parti Prompts (2206.10789 §5.2, §5.4): human side-by-side only on P222.
- DrawBench (2205.11487 §4.3): human raters only; COCO uses FID + CLIP.
- Parti Prompts (2206.10789 §5.2, §5.4): human side-by-side only on P222.
- DrawBench (2205.11487 §4.3): human raters only; COCO uses FID + CLIP.
- GenAI Bench (2406.13743): VQAScore only (web/PWC; ar5iv failed).
- VBench (2311.17982): 16 dimension-specific methods; no single Pruna metric.
- COCO (2205.11487 §4.1): FID and CLIP score for fidelity and alignment.
- ImageNet (1409.0575 §4): top-1/top-5 classification accuracy.
- WikiText (1609.07843 §5): perplexity on validation/test.
- GenEval (2310.11513 §3.2): Mask2Former + CLIP color pipeline, binary score.
- COCO (2205.11487 §4.1): FID and CLIP score for fidelity and alignment.
- ImageNet (1409.0575 §4): top-1/top-5 classification accuracy.
- WikiText (1609.07843 §5): perplexity on validation/test.
- GenEval (2310.11513 §3.2): Mask2Former + CLIP color pipeline, binary score.
- HPS (2306.09341): HPS v2 scoring model (CLIP fine-tuned on HPD v2).
- ImgEdit (2505.20275 §4.2): GPT-4o 1–5 ratings and ImgEdit-Judge.
- Long Text Bench (2507.22058 §4): Text Accuracy (OCR, Qwen2.5-VL-7B).
- GEditBench (2504.17761 §4.2): VIEScore (SQ, PQ, O via GPT-4.1/Qwen2.5-VL).
- OneIG (2506.07977 §4.1): per-dimension metrics (semantic alignment, ED, etc.).
- ImgEdit (2505.20275 §4.2): GPT-4o 1–5 ratings and ImgEdit-Judge.
- Long Text Bench (2507.22058 §4): Text Accuracy (OCR, Qwen2.5-VL-7B).
- GEditBench (2504.17761 §4.2): VIEScore (SQ, PQ, O via GPT-4.1/Qwen2.5-VL).
- OneIG (2506.07977 §4.1): per-dimension metrics (semantic alignment, ED, etc.).
- DPG (2403.05135): DSG-style graph score, mPLUG-large adjudicator.
"""

Expand Down Expand Up @@ -174,7 +174,7 @@ def list(cls, task_type: str | None = None) -> list[str]:
"Covers basic skills (scene, attributes, spatial relationships) to advanced reasoning "
"(counting, comparison, logic/negation) with over 24k human ratings."
),
metrics=["vqa", "clip_score"],
metrics=[], # Paper uses VQAScore only; not in Pruna
task_type="text_to_image",
reference="https://arxiv.org/abs/2406.13743",
),
Expand All @@ -195,7 +195,7 @@ def list(cls, task_type: str | None = None) -> list[str]:
"MS-COCO for text-to-image evaluation (Imagen, 2205.11487). Paper reports "
"FID for fidelity and CLIP score for image-text alignment."
),
metrics=["fid", "clip_score"], # §4.1: FID + CLIP score
metrics=["fid", "clip_score"], # §4.1: FID + CLIP score
task_type="text_to_image",
reference="https://arxiv.org/abs/2205.11487",
),
Expand Down Expand Up @@ -285,13 +285,6 @@ def list(cls, task_type: str | None = None) -> list[str]:
task_type="text_to_image",
reference="https://arxiv.org/abs/2506.07977",
),
Benchmark(
name="OneIG Knowledge Reasoning",
description="OneIG subset: knowledge- and reasoning-heavy prompts.",
metrics=["oneig_reasoning"],
task_type="text_to_image",
reference="https://arxiv.org/abs/2506.07977",
),
Benchmark(
name="OneIG Multilingualism",
description="OneIG subset: multilingual prompts (incl. Chinese splits).",
Expand Down
8 changes: 5 additions & 3 deletions src/pruna/evaluation/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,16 @@
from pruna.evaluation.metrics.metric_evalharness import LMEvalMetric
from pruna.evaluation.metrics.metric_memory import DiskMemoryMetric, InferenceMemoryMetric, TrainingMemoryMetric
from pruna.evaluation.metrics.metric_model_architecture import TotalMACsMetric, TotalParamsMetric
from pruna.evaluation.metrics.metric_pairwise_clip import PairwiseClipScore
from pruna.evaluation.metrics.metric_oneig_alignment import OneIGAlignmentMetric
from pruna.evaluation.metrics.metric_oneig_reasoning import OneIGReasoningMetric
from pruna.evaluation.metrics.metric_pairwise_clip import PairwiseClipScore
from pruna.evaluation.metrics.metric_qa_accuracy import QAAccuracyMetric
from pruna.evaluation.metrics.metric_text_score import OneIGTextScoreMetric, TextScoreMetric
from pruna.evaluation.metrics.metric_vqa import VQAMetric
from pruna.evaluation.metrics.metric_rapiddata import RapidataMetric as RapidataMetric
from pruna.evaluation.metrics.metric_sharpness import SharpnessMetric
from pruna.evaluation.metrics.metric_text_score import OneIGTextScoreMetric, TextScoreMetric
from pruna.evaluation.metrics.metric_torch import TorchMetricWrapper
from pruna.evaluation.metrics.metric_vie_score import VieScoreMetric
from pruna.evaluation.metrics.metric_vqa import VQAMetric
from pruna.evaluation.metrics.vlm_base import (
BaseVLM,
LitellmVLM,
Expand Down Expand Up @@ -65,6 +66,7 @@
"RapidataMetric",
"TextScoreMetric",
"VQAMetric",
"VieScoreMetric",
"BaseVLM",
"LitellmVLM",
"StatefulVLMMeanScoresMetric",
Expand Down
13 changes: 4 additions & 9 deletions src/pruna/evaluation/metrics/metric_oneig_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,6 @@ class OneIGAlignmentMetric(QAAccuracyMetric):
(default ``2 x 2``), score **one question per VLM call** across all cells, apply
dependency masking per cell, then average cell scores.

Scoring semantics
-----------------
OneIG Q_D probes are phrased so **Yes = aligned**. Each call requests
:meth:`~pruna.evaluation.metrics.vlm_base.BaseVLM.score` with expected answer
``"Yes"`` (probability of Yes). Low scores act as semantic **No** for dependency
Expand All @@ -178,11 +176,9 @@ class OneIGAlignmentMetric(QAAccuracyMetric):
api_key : str | None, optional
API key for litellm.
call_type : str, optional
Call type for the metric.
aggregation : str, optional
Unused; kept for registry compatibility with :class:`QAAccuracyMetric`.
Call type for the metric (``"single"`` or ``"pairwise"``).
**kwargs : Any
Additional keyword arguments for :class:`QAAccuracyMetric`.
Forwarded to :class:`QAAccuracyMetric` (e.g. ``aggregation``).

Examples
--------
Expand All @@ -199,7 +195,6 @@ class OneIGAlignmentMetric(QAAccuracyMetric):

def __init__(
self,
*args: Any,
grid_size: tuple[int, int] = (2, 2),
vlm: Any | None = None,
vlm_type: Literal["litellm", "transformers"] = "transformers",
Expand All @@ -212,18 +207,18 @@ def __init__(
**kwargs: Any,
) -> None:
super().__init__(
*args,
vlm=vlm,
vlm_type=vlm_type,
model_name=model_name,
vlm_kwargs=vlm_kwargs,
structured_output=structured_output,
device=device,
api_key=api_key,
call_type=call_type if call_type is not None else "y_gt",
call_type=call_type,
**kwargs,
)
self.grid_size = (int(grid_size[0]), int(grid_size[1]))
self.metric_units = type(self).metric_units

def _score_sample(self, image: Any, aux: dict[str, Any]) -> float:
if not isinstance(image, Image.Image):
Expand Down
11 changes: 5 additions & 6 deletions src/pruna/evaluation/metrics/metric_qa_accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,6 @@ class QAAccuracyMetric(StatefulVLMMeanScoresMetric):

Parameters
----------
*args : Any
Additional positional arguments.
vlm : BaseVLM | None, optional
Custom VLM instance. If provided, ``vlm_type`` and ``model_name`` are ignored.
vlm_type : {"litellm", "transformers"}, optional
Expand All @@ -76,8 +74,10 @@ class QAAccuracyMetric(StatefulVLMMeanScoresMetric):
API key for litellm.
call_type : str, optional
Call type for the metric.
aggregation : {"mean", "all_or_nothing"}, optional
Per-image score aggregation (keyword-only). Default is ``"mean"``.
**kwargs : Any
Supports ``aggregation``: ``"mean"`` or ``"all_or_nothing"``.
Additional keyword arguments forwarded to the parent class.

Raises
------
Expand Down Expand Up @@ -111,15 +111,14 @@ class QAAccuracyMetric(StatefulVLMMeanScoresMetric):

def __init__(
self,
*args,
vlm: BaseVLM | None = None,
vlm_type: Literal["litellm", "transformers"] = "litellm",
model_name: str | None = None,
vlm_kwargs: dict | None = None,
structured_output: bool = True,
device: str | torch.device | None = None,
api_key: str | None = None,
call_type: str = SINGLE,
call_type: str | None = None,
*,
aggregation: str = "mean",
**kwargs: Any,
Expand All @@ -139,7 +138,7 @@ def __init__(
structured_output=structured_output,
device=device,
api_key=api_key,
call_type=call_type,
call_type=call_type if call_type is not None else SINGLE,
)

def _extract_questions(self, gt: Any, n: int) -> list[list[str]]:
Expand Down
47 changes: 30 additions & 17 deletions src/pruna/evaluation/metrics/metric_torch.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,26 @@
)
from pruna.logging.logger import pruna_logger

# Keyword names that Pruna's Task routing passes alongside metric kwargs and
# that must never reach a torchmetrics constructor.
_PRUNA_TASK_ROUTING_KWARGS: tuple[str, ...] = (
    "vlm_type",
    "model_name",
    "structured_output",
    "vlm_kwargs",
    "api_key",
)


def _strip_task_routing_kwargs(kwargs: dict[str, Any]) -> None:
    """
    Remove Pruna task-routing keys from ``kwargs`` in place.

    :class:`~pruna.evaluation.task.Task` passes these keys when it builds mixed
    metric lists. Torchmetrics classes often end with ``**kwargs`` and would
    otherwise accept the bogus keys until a lower layer raises, so they are
    dropped here to keep :class:`TorchMetricWrapper` the single choke point
    between Pruna routing and torchmetrics constructors.

    Parameters
    ----------
    kwargs : dict[str, Any]
        Keyword arguments to sanitize. The mapping is mutated directly; every
        key listed in ``_PRUNA_TASK_ROUTING_KWARGS`` is deleted when present,
        and all other entries are left untouched.
    """
    for routing_key in _PRUNA_TASK_ROUTING_KWARGS:
        # Absent keys are simply skipped; nothing is raised for them.
        if routing_key in kwargs:
            del kwargs[routing_key]


def default_update(metric: Metric, *args, **kwargs) -> None:
"""
Expand Down Expand Up @@ -124,9 +144,7 @@ def arniqa_update(metric: ARNIQA, preds: Any) -> None:


def ssim_update(
metric: StructuralSimilarityIndexMeasure | MultiScaleStructuralSimilarityIndexMeasure,
preds: Any,
target: Any
metric: StructuralSimilarityIndexMeasure | MultiScaleStructuralSimilarityIndexMeasure, preds: Any, target: Any
) -> None:
"""
Update handler for SSIM or MS-SSIM metric.
Expand All @@ -152,29 +170,22 @@ class TorchMetrics(Enum):
"""
Enumeration of torchmetrics metrics for evaluation.

This enum provides a tuple per member (metric_factory, update_fn, call_type):
metric_factory builds the metric (typically a torchmetrics class, or
functools.partial when some constructor arguments are fixed); update_fn is
an optional custom update handler; call_type describes how inputs are paired
for the metric.
Each member value is a ``(metric_factory, update_fn, call_type)`` tuple.

Parameters
----------
value : tuple
Tuple holding metric_factory, update_fn, and call_type as described above.
``(metric_factory, update_fn, call_type)`` for this enum member.
names : str
The name of the enum member.
Enum member name.
module : str
The module where the enum is defined.
Defining module name.
qualname : str
The qualified name of the enum.
Qualified name of the enum class.
type : type
The type of the enum.
Enum metaclass type.
start : int
The start index for auto-numbering enum values.
boundary : enum.FlagBoundary or None
Boundary handling mode used by the Enum functional API for Flag and
IntFlag enums.
Auto-numbering start index for functional API enums.
"""

fid = (FrechetInceptionDistance, fid_update, "gt_y")
Expand Down Expand Up @@ -246,6 +257,7 @@ def __new__(cls, metric_name: str, call_type: str = "", **kwargs) -> StatefulMet
if metric_name == "clip_score" and call_type.startswith(PAIRWISE):
from pruna.evaluation.metrics.metric_pairwise_clip import PairwiseClipScore

_strip_task_routing_kwargs(kwargs)
return PairwiseClipScore(**kwargs)
return super().__new__(cls)

Expand All @@ -259,6 +271,7 @@ def __init__(self, metric_name: str, call_type: str = "", **kwargs) -> None:
If the metric name is not supported.
"""
self.metric_name = metric_name
_strip_task_routing_kwargs(kwargs)
super().__init__(kwargs.pop("device", None))
try:
self.metric = TorchMetrics[metric_name](**kwargs)
Expand Down
Loading