Object Detection on COCO minival
Metrics
AP50
AP75
APL
APM
APS
box AP
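These columns follow the standard COCO evaluation protocol: box AP is the primary metric, averaged over IoU thresholds 0.50:0.95; AP50 and AP75 are the fixed-threshold variants; and APS, APM, APL break the score down by object size (small, medium, large). As a minimal sketch of how such numbers are typically produced, the snippet below runs the official pycocotools evaluator on a detections file in COCO result format; the file names are placeholders, not part of this benchmark.

```python
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

# Placeholder paths: COCO-format ground-truth annotations and model detections.
coco_gt = COCO("instances_val2017.json")
coco_dt = coco_gt.loadRes("detections.json")

# Evaluate bounding-box detections ("bbox" IoU type).
evaluator = COCOeval(coco_gt, coco_dt, iouType="bbox")
evaluator.evaluate()
evaluator.accumulate()
evaluator.summarize()

# evaluator.stats holds, in order:
# [box AP, AP50, AP75, APS, APM, APL, AR@1, AR@10, AR@100, ARS, ARM, ARL]
box_ap, ap50, ap75, ap_s, ap_m, ap_l = evaluator.stats[:6]
print(f"box AP={box_ap:.3f}  AP50={ap50:.3f}  AP75={ap75:.3f}")
```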
Results
Performance results of the various models on this benchmark.
Comparison table
Model name | AP50 | AP75 | APL | APM | APS | box AP |
---|---|---|---|---|---|---|
resnest-split-attention-networks | 71.00 | 57.07 | 66.29 | 56.36 | 36.80 | 52.47 |
bottom-up-object-detection-by-grouping | 55.1 | 43.7 | 56.1 | 44.0 | 21.6 | 40.3 |
mask-r-cnn | 59.5 | 38.9 | - | - | - | 36.7 |
feature-pyramid-networks-for-object-detection | 61.3 | 43.3 | 52.6 | 43.3 | 22.9 | 39.8 |
hiera-a-hierarchical-vision-transformer | - | - | - | - | - | 55 |
moat-alternating-mobile-convolution-and | - | - | - | - | - | 58.5 |
a-strong-and-reproducible-object-detector | 81.5 | 71.4 | 78.5 | 68.5 | 50.4 | 64.6 |
bottleneck-transformers-for-visual | 71 | 54.2 | - | - | - | 49.5 |
resnest-split-attention-networks | 69.53 | 55.40 | 65.83 | 54.66 | 32.67 | 50.91 |
cascade-r-cnn-delving-into-high-quality | 61.6 | 46.6 | 57.4 | 46.2 | 23.8 | 42.7 |
non-local-neural-networks | 63.1 | 44.5 | - | - | - | 40.8 |
end-to-end-object-detection-with-transformers | 64.7 | 47.7 | 62.3 | 49.5 | 23.7 | 44.9 |
xcit-cross-covariance-image-transformers | - | - | - | - | - | 48.1 |
rethinking-and-improving-relative-position | - | - | - | - | - | 40.8 |
swin-transformer-v2-scaling-up-capacity-and | - | - | - | - | - | 62.5 |
deformable-convnets-v2-more-deformable-better | - | - | - | - | - | 43.1 |
gcnet-non-local-networks-meet-squeeze | 62.4 | 44 | 52.5 | 44.4 | 24.2 | 40.3 |
reppoints-point-set-representation-for-object | - | - | - | - | - | 46.8 |
a-ranking-based-balanced-loss-function | 58.8 | 41.5 | - | - | - | 39.7 |
a-ranking-based-balanced-loss-function | 60.7 | 43.3 | - | - | - | 40.7 |
deep-residual-learning-for-image-recognition | 61.9 | 47.0 | - | - | - | 43.5 |
elsa-enhanced-local-self-attention-for-vision | 70.5 | 56.0 | - | - | - | 51.6 |
190807919 | - | - | 62.2 | 50.3 | 28.8 | 47.0 |
grid-r-cnn | 58.3 | 42.4 | 51.5 | 43.8 | 22.6 | 39.6 |
centermask-real-time-anchor-free-instance-1 | - | - | 58.8 | - | 29.2 | 45.6 |
group-normalization | 61.6 | 44.4 | - | - | - | 40.8 |
recurrent-glimpse-based-decoder-for-detection | 67.5 | 53.1 | 65 | 52.6 | 30 | 49.1 |
rethinking-imagenet-pre-training | 66.8 | 52.9 | - | - | - | 48.6 |
sparse-r-cnn-end-to-end-object-detection-with | 64.6 | 49.5 | 61.6 | 48.3 | 28.3 | 45.6 |
190909777 | 55.3 | - | - | - | - | 35.6 |
cornernet-detecting-objects-as-paired | 53.8 | 40.9 | 51.8 | 40.5 | 18.6 | 38.4 |
a-novel-region-of-interest-extraction-layer | 59.9 | 41.7 | 49.7 | 42.1 | 22.9 | 38.4 |
190408900 | - | - | 57.1 | 43.5 | 23.8 | 41.4 |
eva-exploring-the-limits-of-masked-visual | 82.1 | 70.8 | 78.5 | 68.4 | 49.4 | 64.5 |
pyramid-vision-transformer-a-versatile | 63.6 | 46.1 | 59.5 | 46.0 | 26.1 | 43.4 |
reducing-label-noise-in-anchor-free-object | 59.5 | 44.2 | 52.3 | 44.7 | 25.4 | 40.5 |
efficientdet-scalable-and-efficient-object | 73.4 | 59.0 | 67.9 | 58.0 | 40.0 | - |
reppoints-point-set-representation-for-object | - | - | - | - | - | 40.3 |
end-to-end-object-detection-with-transformers | 63.9 | 47.8 | 56 | 48.1 | 27.2 | 44 |
houghnet-integrating-near-and-long-range | 62.2 | 46.9 | 55.8 | 47.6 | 25.5 | 43.0 |
context-autoencoder-for-self-supervised | - | - | - | - | - | 54.5 |
conditional-detr-for-fast-training | 65.4 | 48.5 | 62.2 | 49 | 25.3 | 45.1 |
190408900 | - | - | 58.4 | 44.3 | 25.5 | 42.6 |
bottleneck-transformers-for-visual | 71.3 | 54.6 | - | - | - | 49.7 |
improved-multiscale-vision-transformers-for | - | - | - | - | - | 58.7 |
attentive-normalization | 66.2 | 49.1 | - | - | - | 44.9 |
centermask-real-time-anchor-free-instance-1 | 67.8 | - | - | - | - | 48.6 |
exploring-plain-vision-transformer-backbones | - | - | - | - | - | 60.4 |
retinamask-learning-to-predict-masks-improves | 60.2 | 44.1 | - | - | - | 41.1 |
rethinking-and-improving-relative-position | - | - | - | - | - | 42.3 |
bottom-up-object-detection-by-grouping | 59.6 | 46.8 | 59.4 | 46.6 | 25.7 | 43.3 |
masked-autoencoders-are-scalable-vision | - | - | - | - | - | 53.3 |
rethinking-pre-training-and-self-training | - | - | - | - | - | 54.2 |
mask-r-cnn | - | - | - | - | - | 37.7 |
anchor-detr-query-design-for-transformer | 65.7 | 48.8 | 61.6 | 49.4 | 25.8 | 45.1 |
detrs-with-collaborative-hybrid-assignments | - | - | - | - | - | 65.9 |
dn-detr-accelerate-detr-training-by | 67.6 | 53.8 | 65.4 | 52.6 | 31.3 | 49.5 |
conditional-detr-for-fast-training | 64 | 45.7 | 61.5 | 46.7 | 22.7 | 43 |
resnest-split-attention-networks | 68.78 | 55.17 | 63.9 | 54.2 | - | 50.54 |
a-ranking-based-balanced-loss-function | 60.3 | 42.3 | - | - | - | 40.2 |
dino-detr-with-improved-denoising-anchor-1 | 69.1 | 56 | 65.8 | 54.2 | 34.5 | 51.3 |
vit-comer-vision-transformer-with | - | - | - | - | - | 64.3 |
foveabox-beyond-anchor-based-object-detector | 57.8 | 40.5 | - | - | - | 38.1 |
moat-alternating-mobile-convolution-and | - | - | - | - | - | 57.7 |
general-object-foundation-model-for-images | - | - | - | - | - | 62.0 |
gradient-harmonized-single-stage-detector | 55.5 | 38.1 | 46.7 | 39.6 | 19.6 | 35.8 |
dynamic-head-unifying-object-detection-heads | 78.2 | - | 74.2 | - | - | 60.3 |
efficientdet-scalable-and-efficient-object | - | - | - | - | - | 52.1 |
centermask-real-time-anchor-free-instance-1 | - | - | 57.7 | - | 28.5 | 44.9 |
virtex-learning-visual-representations-from | - | - | - | - | - | 40.9 |
towards-all-in-one-pre-training-via | - | - | - | - | - | 65.0 |
yolov6-v3-0-a-full-scale-reloading | 74.5 | - | - | - | - | 57.2 |
pix2seq-a-language-modeling-framework-for | - | - | - | - | - | 47.3 |
grid-r-cnn | 60.3 | 44.4 | 54.1 | 45.8 | 23.4 | 41.3 |
augmenting-convolutional-networks-with | - | - | - | - | - | 46.4 |
a-novel-region-of-interest-extraction-layer | 59.2 | 40.6 | 47.8 | 41.5 | 22.3 | 37.5 |
recursively-refined-r-cnn-instance | 64.1 | 48.4 | 58.9 | 47.1 | 27 | 44.3 |
internimage-exploring-large-scale-vision | - | - | - | - | - | 65.0 |
metaformer-is-actually-what-you-need-for | 63.1 | 44.8 | - | - | - | 41.0 |
centernet-object-detection-with-keypoint | 59.2 | 43.9 | 55.8 | 43.8 | 23.6 | 41.3 |
uniform-masking-enabling-mae-pre-training-for | - | - | - | - | - | 57.4 |
moat-alternating-mobile-convolution-and | - | - | - | - | - | 59.2 |
190807919 | 59.2 | 44.9 | 54.1 | 44.2 | 23.7 | 41.3 |
pyramid-vision-transformer-a-versatile | 63.7 | 45.4 | 58.4 | 46.0 | 25.8 | 42.6 |
190807919 | - | - | - | 45.4 | 25.0 | 42.3 |
improved-multiscale-vision-transformers-for | - | - | - | - | - | 54.3 |
deformable-convnets-v2-more-deformable-better | - | - | 58.7 | 45.8 | 22.2 | 41.7 |
centermask-real-time-anchor-free-instance-1 | - | - | - | 48.3 | 27.7 | 44.6 |
sparse-r-cnn-end-to-end-object-detection-with | 62.1 | 47.2 | 59.7 | 46.3 | 26.1 | 43.5 |
dino-detr-with-improved-denoising-anchor-1 | 69 | 55.8 | 65.3 | 54.3 | 35 | 51.2 |
scale-aware-trident-networks-for-object | 63.5 | 45.5 | 56.9 | 47 | 24.9 | 42 |
transnext-robust-foveal-visual-perception-for | - | - | - | - | - | 55.7 |
improved-multiscale-vision-transformers-for | - | - | - | - | - | 56.1 |
conditional-detr-for-fast-training | 66.8 | 49.5 | 63.3 | 50.3 | 27.2 | 45.9 |
pix2seq-a-language-modeling-framework-for | - | - | - | - | - | 50.0 |
augmenting-convolutional-networks-with | - | - | - | - | - | 47.0 |
hybrid-task-cascade-for-instance-segmentation | 59.4 | 40.7 | 52.3 | 40.9 | 20.3 | 43.2 |
xcit-cross-covariance-image-transformers | - | - | - | - | - | 48.5 |
spinenet-learning-scale-permuted-backbone-for | - | - | - | - | - | 52.2 |
end-to-end-semi-supervised-object-detection | - | - | - | - | - | 60.1 |
190807919 | - | - | - | 47.9 | 26.1 | - |
res2net-a-new-multi-scale-backbone | 53.6 | - | 51.1 | 38.3 | 14 | 33.7 |
mask-r-cnn | - | - | - | - | - | 40.0 |
pix2seq-a-language-modeling-framework-for | - | - | - | - | - | 42.6 |
general-object-foundation-model-for-images | - | - | - | - | - | 55.0 |
exploring-plain-vision-transformer-backbones | - | - | - | - | - | 61.3 |
feature-selective-anchor-free-module-for | 55.0 | 37.9 | 48.2 | 39.6 | 19.8 | 35.9 |
reppoints-point-set-representation-for-object | - | - | - | - | - | 40.8 |
simple-copy-paste-is-a-strong-data | - | - | - | - | - | 54.5 |
pix2seq-a-language-modeling-framework-for | - | - | - | - | - | 47.1 |
feature-selective-anchor-free-module-for | 62.4 | - | - | - | - | 41.6 |
moat-alternating-mobile-convolution-and | - | - | - | - | - | 55.9 |
non-local-neural-networks | 67.8 | 48.9 | - | - | - | 45.0 |
res2net-a-new-multi-scale-backbone | 66.5 | 51.3 | 62.1 | 51.6 | 28.6 | 47.5 |
when-shift-operation-meets-vision-transformer | - | - | - | 42.3 | - | - |
2103-15358 | - | 47.6 | 58.1 | 48 | 29.9 | 44.7 |
centermask-real-time-anchor-free-instance-1 | - | - | 57.1 | - | 26.7 | 44.4 |
houghnet-integrating-near-and-long-range | 64.6 | 50.3 | 59.7 | 48.8 | 30.0 | 46.1 |
190807919 | 58.9 | 41.5 | 49.6 | 40.8 | 22.6 | 38.0 |
davit-dual-attention-vision-transformers | - | - | - | - | - | 49.9 |
gcnet-non-local-networks-meet-squeeze | 66.9 | 52.2 | - | - | - | 47.9 |
recursively-refined-r-cnn-instance | 61 | 46.3 | 55.7 | 45.2 | 24.5 | 42 |
190807919 | 62.7 | 48.7 | 58.5 | 48.1 | 26.3 | 44.6 |
universal-instance-perception-as-object | 77.5 | 66.7 | 75.3 | 64.8 | 45.1 | 60.6 |
improved-multiscale-vision-transformers-for | - | - | - | - | - | 52.7 |
dino-detr-with-improved-denoising-anchor-1 | - | - | - | - | - | 63.2 |
swin-transformer-hierarchical-vision | - | - | - | - | - | 57.1 |
usb-universal-scale-object-detection | 69.5 | 55.4 | 65.8 | 55.5 | 33.5 | 50.9 |
transnext-robust-foveal-visual-perception-for | - | - | - | - | - | 57.1 |
moat-alternating-mobile-convolution-and | - | - | - | - | - | 55.2 |
general-object-foundation-model-for-images | - | - | - | - | - | 60.4 |
cbnetv2-a-composite-backbone-network | - | - | - | - | - | 59.6 |
dynamic-head-unifying-object-detection-heads | - | - | - | - | - | 46.5 |
recursively-refined-r-cnn-instance | 64.3 | 48.9 | 59.6 | 48.3 | 26.6 | 44.8 |
recursively-refined-r-cnn-instance | 61.2 | 45.6 | - | - | 24.4 | - |
usb-universal-scale-object-detection | 70.8 | 58.9 | 68.1 | 57.5 | 36.9 | 53.5 |
190807919 | 62.8 | 45.9 | 54.6 | 44.7 | - | 41.8 |
reppoints-point-set-representation-for-object | - | - | - | - | - | 46.4 |
cp-detr-concept-prompt-guide-detr-toward | - | - | - | - | - | 64.1 |
pvtv2-improved-baselines-with-pyramid-vision | 69.5 | 54.9 | - | - | - | 50.1 |
fcos-fully-convolutional-one-stage-object | 57.4 | 41.4 | 49.8 | 42.5 | 22.3 | 38.6 |
global-context-networks | 70.4 | 56.1 | - | - | - | 51.8 |
sparse-r-cnn-end-to-end-object-detection-with | 61.2 | 45.7 | 57.6 | 44.6 | 26.7 | 42.3 |
conditional-detr-for-fast-training | 65.6 | 47.5 | 63.6 | 48.4 | 23.6 | 44.5 |
moat-alternating-mobile-convolution-and | - | - | - | - | - | 50.5 |
dab-detr-dynamic-anchor-boxes-are-better-1 | 67 | 50.2 | 64.1 | 50.5 | 28.1 | 46.6 |
pix2seq-a-language-modeling-framework-for | 61.0 | 46.1 | 58.6 | 47 | 26.6 | 43.2 |
swin-transformer-hierarchical-vision | - | - | - | - | - | 58 |
feature-selective-anchor-free-module-for | 58.0 | - | - | - | - | 37.9 |
sparse-r-cnn-end-to-end-object-detection-with | 63.4 | 48.2 | 59.5 | 47.2 | 26.9 | 44.5 |
simple-training-strategies-and-model-scaling | - | - | 70.3 | 56.2 | 33.9 | 53.1 |
revisiting-efficient-object-detection | 65.5 | 52.2 | 61.1 | 51.9 | 30.3 | 47.8 |
focal-self-attention-for-local-global | 77.2 | - | 73.4 | - | - | 58.7 |
deep-residual-learning-for-image-recognition | 63.0 | 48.3 | - | - | - | 44.5 |
190807919 | - | - | 60.1 | - | 27.5 | 46.0 |
you-only-learn-one-representation-unified | 70.6 | 57.4 | 65.2 | 57.3 | 37.4 | - |
m2det-a-single-shot-object-detector-based-on | 53.7 | - | 49.3 | 39.5 | 15.9 | 34.1 |
x-volution-on-the-unification-of-convolution | 64 | 46.4 | 55 | 46 | 26.9 | 42.8 |
vision-transformer-adapter-for-dense | - | - | - | - | - | 60.2 |
foveabox-beyond-anchor-based-object-detector | 57.8 | 40.2 | 52.7 | 42.2 | 19.5 | 38 |
m2det-a-single-shot-object-detector-based-on | 52.2 | - | 49.1 | 38.2 | 15 | 33.2 |
group-normalization | 61 | 44 | - | - | - | 40.3 |
moat-alternating-mobile-convolution-and | - | - | - | - | - | 53.0 |
understanding-the-robustness-in-vision | - | - | - | - | - | 55.1 |
rethinking-imagenet-pre-training | 67.1 | 51.1 | - | - | - | 46.4 |
2103-15358 | 65.5 | 47.1 | 58.3 | 47.9 | 28.9 | 44.3 |
could-giant-pretrained-image-models-extract | - | - | - | - | - | 59.3 |
weight-standardization | 64.15 | 47.11 | 56.39 | 47.19 | 25.49 | 43.12 |
reppoints-point-set-representation-for-object | - | - | - | - | - | 38.6 |
anchor-detr-query-design-for-transformer | 64.7 | 47.5 | 60.6 | 48.2 | 24.7 | 44.2 |
190807919 | - | - | 51.0 | 41.7 | - | 39.2 |
vision-transformer-adapter-for-dense | - | - | - | - | - | 60.5 |
bottleneck-transformers-for-visual | - | - | - | - | - | 45.9 |
feature-selective-anchor-free-module-for | 59.2 | - | - | - | - | 39.3 |
focal-modulation-networks | - | - | - | - | - | 64.2 |
scaled-yolov4-scaling-cross-stage-partial | 73.3 | 60.7 | 67.4 | 59.5 | 38.1 | 55.4 |
florence-a-new-foundation-model-for-computer | - | - | - | - | - | 62 |
end-to-end-semi-supervised-object-detection | - | - | - | - | - | 60.7 |
queryinst-parallelly-supervised-mask-query | 75.8 | 61.7 | 71.5 | 59.8 | 40.2 | 56.1 |
reversible-column-networks | - | - | - | - | - | 63.8 |
reppoints-point-set-representation-for-object | - | - | - | - | - | 44.5 |
detrs-with-collaborative-hybrid-assignments | - | - | - | - | - | 64.7 |
focal-modulation-networks | 70.3 | 56.0 | - | - | - | 51.5 |
focal-modulation-networks | 70.1 | 55.8 | - | - | - | - |
simple-copy-paste-is-a-strong-data | - | - | - | - | - | 57.0 |
activemlp-an-mlp-like-architecture-with | - | - | - | - | - | 52.3 |
moat-alternating-mobile-convolution-and | - | - | - | - | - | 51.9 |
foveabox-beyond-anchor-based-object-detector | 58.4 | 41.5 | 51.7 | 43.5 | 22.3 | 38.9 |
grounded-language-image-pre-training | - | - | - | - | - | 60.8 |
transnext-robust-foveal-visual-perception-for | - | - | - | - | - | 56.6 |
masked-autoencoders-are-scalable-vision | - | - | - | - | - | 50.3 |
190807919 | 61.8 | 44.8 | 53.3 | 43.7 | 24.4 | 40.9 |
group-normalization | 62.8 | 46.2 | - | - | - | 42.3 |
190807919 | 61.7 | 47.7 | 57.4 | 46.5 | 25.6 | 43.7 |
dynamic-head-unifying-object-detection-heads | 76.8 | - | 73.2 | 62.2 | 44.5 | 58.4 |
simple-training-strategies-and-model-scaling | - | - | 70.6 | 56.7 | 34.5 | 53.6 |
hornet-efficient-high-order-spatial | - | - | - | - | - | 59.2 |
lip-local-importance-based-pooling | 63.6 | 45.6 | - | 45.8 | 25.2 | 41.7 |
190807919 | - | - | 59.5 | 48.4 | 27.0 | 45.3 |
grounding-dino-marrying-dino-with-grounded | - | - | - | - | - | 63.0 |
solq-segmenting-objects-by-learning-queries | 74.9 | 61.3 | 71.9 | - | - | - |
adaptively-connected-neural-networks | - | - | - | - | - | 39.5 |
deep-residual-learning-for-image-recognition | 64.3 | 50.5 | - | - | - | 46.3 |
pix2seq-a-language-modeling-framework-for | 63.2 | 48.6 | 60.4 | 48.9 | 28.2 | 45.0 |
usb-universal-scale-object-detection | 67.0 | 52.6 | 62.7 | 52.7 | 30.6 | 48.5 |
dynamic-head-unifying-object-detection-heads | - | - | 66.3 | - | - | - |
cbnetv2-a-composite-backbone-network | - | - | - | - | - | 59.1 |
rethinking-imagenet-pre-training | - | - | - | - | - | 47.4 |
libra-r-cnn-towards-balanced-learning-for | 59.3 | 42.0 | 50.5 | 42.1 | 22.9 | 38.5 |
reppoints-point-set-representation-for-object | - | - | - | - | - | 44.8 |
internimage-exploring-large-scale-vision | - | - | - | - | - | 64.2 |
190807919 | - | - | - | 46.0 | 26.6 | 43.1 |
elsa-enhanced-local-self-attention-for-vision | 70.4 | 52.9 | - | - | - | 48.3 |
towards-sustainable-self-supervised-learning | - | - | - | - | - | 54.6 |
cascade-r-cnn-delving-into-high-quality | 59.4 | 43.7 | 54.1 | 43.7 | 22.9 | 40.3 |
non-local-neural-networks | 61.1 | 41.9 | - | - | - | 39.0 |
you-only-learn-one-representation-unified | 73.5 | 60.6 | 68.7 | 60.1 | 40.4 | - |
foveabox-beyond-anchor-based-object-detector | 55.2 | 37.9 | 50.5 | 39.4 | 18.6 | 36.0 |
dab-detr-dynamic-anchor-boxes-are-better-1 | 64.7 | 47.2 | 62.9 | 48.2 | 24.1 | 44.1 |