-
Notifications
You must be signed in to change notification settings - Fork 143
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
6 changed files
with
490 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# COCO-Stuff-10K | ||
|
||
<!-- [ALGORITHM] --> | ||
|
||
## Introduction | ||
|
||
The Common Objects in COntext-stuff (COCO-stuff) dataset is a dataset for scene understanding tasks such as semantic segmentation, object detection and image captioning. It is built on top of the original COCO dataset, which annotated things but neglected stuff, by adding the missing stuff annotations. The COCO-Stuff-10K dataset contains 10k images spanning 172 categories: 80 things, 91 stuff, and 1 unlabeled class. It is split into 9,000 images for training and 1,000 images for testing. | ||
|
||
## Results and Models | ||
|
||
| Method | Backbone | Pre-train | Batch Size | Lr schd | Crop Size | mIoU (SS) | mIoU (MS) | #Param | Config | Download | | ||
|:-----------:|:-------------:|:---------------------------------------------------------------------------------------------------------------------:|:----------:|:-------:|:---------:|:------------------------------------------------------------------------------------------:|:------------------------------------------------------------------------------------------:|:------:|:---------------------------------------------------------------------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:| | ||
| UperNet | ViT-Adapter-L | [BEiT-L](https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22k.pth) | 8x2 | 80k | 512 | [51.0](https://drive.google.com/file/d/1xZodiAvOLGaLtMGx_btYVZIMC2VKrDhI/view?usp=sharing) | [51.4](https://drive.google.com/file/d/1bmFG9GA4bRqOEJfqXcO7nWYPwG3wSk2J/view?usp=sharing) | 451M | [config](./upernet_beit_adapter_large_512_80k_cocostuff10k_ss.py) | [model](https://github.com/czczup/ViT-Adapter/releases/download/v0.2.4/upernet_beit_adapter_large_512_80k_cocostuff10k.pth.tar) \| [log](https://github.com/czczup/ViT-Adapter/releases/download/v0.2.4/20220505_091358.log) | | ||
| Mask2Former | ViT-Adapter-L | [BEiT-L](https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22k.pth) | 8x2 | 40k | 512 | [53.2](https://drive.google.com/file/d/1Buewc1n7GBAcBDXeia-QarujrDZqc_Sx/view?usp=sharing) | [54.2](https://drive.google.com/file/d/1kQgJUHDeQoO3pPY6QoXRKwyF7heT7wCJ/view?usp=sharing) | 568M | [config](./mask2former_beit_adapter_large_512_40k_cocostuff10k_ss.py) | [model]() \| [log]() | |
149 changes: 149 additions & 0 deletions
149
segmentation/configs/coco_stuff10k/mask2former_beit_adapter_large_512_40k_cocostuff10k_ms.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,149 @@ | ||
# Copyright (c) Shanghai AI Lab. All rights reserved. | ||
# Mask2Former + BEiT-Adapter-L on COCO-Stuff-10K, 512x512 crops, 40k iterations,
# multi-scale + flip testing.
#
# Base configs this file builds on (model, dataset, runtime and schedule).
_base_ = [
    '../_base_/models/mask2former_beit_cocostuff.py',
    '../_base_/datasets/coco-stuff10k.py',
    '../_base_/default_runtime.py',
    '../_base_/schedules/schedule_40k.py'
]

# Shared by RandomCrop / Pad in training and by sliding-window inference.
crop_size = (512, 512)

# Local path to the BEiT-L checkpoint. The weights can be downloaded from:
# https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22k.pth
pretrained = 'pretrained/beit_large_patch16_224_pt22k_ft22k.pth'

model = dict(
    pretrained=pretrained,
    backbone=dict(
        type='BEiTAdapter',
        img_size=512,
        patch_size=16,
        embed_dim=1024,
        depth=24,
        num_heads=16,
        mlp_ratio=4,
        qkv_bias=True,
        use_abs_pos_emb=False,
        use_rel_pos_bias=True,
        init_values=1e-6,
        drop_path_rate=0.3,
        conv_inplane=64,
        n_points=4,
        deform_num_heads=16,
        cffn_ratio=0.25,
        deform_ratio=0.5,
        # Four [start, end] index pairs that together cover blocks 0..23
        # (depth=24).
        interaction_indexes=[[0, 5], [6, 11], [12, 17], [18, 23]],
    ),
    decode_head=dict(
        # One entry per backbone output level, each matching embed_dim.
        in_channels=[1024, 1024, 1024, 1024],
        feat_channels=1024,
        out_channels=1024,
        num_queries=100,
        pixel_decoder=dict(
            type='MSDeformAttnPixelDecoder',
            num_outs=3,
            norm_cfg=dict(type='GN', num_groups=32),
            act_cfg=dict(type='ReLU'),
            encoder=dict(
                type='DetrTransformerEncoder',
                num_layers=6,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=dict(
                        type='MultiScaleDeformableAttention',
                        embed_dims=1024,
                        num_heads=32,
                        num_levels=3,
                        num_points=4,
                        im2col_step=64,
                        dropout=0.0,
                        batch_first=False,
                        norm_cfg=None,
                        init_cfg=None),
                    ffn_cfgs=dict(
                        type='FFN',
                        embed_dims=1024,
                        feedforward_channels=4096,
                        num_fcs=2,
                        ffn_drop=0.0,
                        act_cfg=dict(type='ReLU', inplace=True)),
                    operation_order=('self_attn', 'norm', 'ffn', 'norm')),
                init_cfg=None),
            # num_feats=512 is half of embed_dims=1024.
            positional_encoding=dict(
                type='SinePositionalEncoding', num_feats=512, normalize=True),
            init_cfg=None),
        positional_encoding=dict(
            type='SinePositionalEncoding', num_feats=512, normalize=True),
        transformer_decoder=dict(
            type='DetrTransformerDecoder',
            return_intermediate=True,
            num_layers=9,
            transformerlayers=dict(
                type='DetrTransformerDecoderLayer',
                attn_cfgs=dict(
                    type='MultiheadAttention',
                    embed_dims=1024,
                    num_heads=32,
                    attn_drop=0.0,
                    proj_drop=0.0,
                    dropout_layer=None,
                    batch_first=False),
                ffn_cfgs=dict(
                    embed_dims=1024,
                    feedforward_channels=4096,
                    num_fcs=2,
                    act_cfg=dict(type='ReLU', inplace=True),
                    ffn_drop=0.0,
                    dropout_layer=None,
                    add_identity=True),
                feedforward_channels=4096,
                operation_order=('cross_attn', 'norm', 'self_attn', 'norm',
                                 'ffn', 'norm')),
            init_cfg=None)
    ),
    # Sliding-window inference: 512x512 windows moved with a 341-pixel stride,
    # so neighbouring windows overlap by 171 pixels.
    test_cfg=dict(mode='slide', crop_size=crop_size, stride=(341, 341))
)

# dataset settings
# Per-channel normalisation statistics shared by the train and test pipelines.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', reduce_zero_label=True),
    dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize', **img_norm_cfg),
    # Images are padded with 0, segmentation maps with 255.
    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
    # 'gt_masks' and 'gt_labels' are collected below in addition to the
    # semantic map; presumably ToMask is the step that produces them.
    dict(type='ToMask'),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect',
         keys=['img', 'gt_semantic_seg', 'gt_masks', 'gt_labels'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2048, 512),
        # Multi-scale testing: six scale ratios, each with flipping enabled.
        img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
        flip=True,
        transforms=[
            dict(type='SETR_Resize', keep_ratio=True,
                 crop_size=crop_size, setr_multi_scale=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]

# `_delete_=True` replaces the optimizer inherited from the base schedule
# instead of merging into it. num_layers matches the backbone depth (24).
optimizer = dict(
    _delete_=True, type='AdamW', lr=2e-5, betas=(0.9, 0.999),
    weight_decay=0.05,
    constructor='LayerDecayOptimizerConstructor',
    paramwise_cfg=dict(num_layers=24, layer_decay_rate=0.90))

# Poly schedule with power=1.0 down to min_lr=0, after a 1500-iteration
# linear warmup; also replaces the inherited lr_config.
lr_config = dict(
    _delete_=True,
    policy='poly',
    warmup='linear',
    warmup_iters=1500,
    warmup_ratio=1e-6,
    power=1.0, min_lr=0.0, by_epoch=False)

# Two samples per GPU; train uses the training pipeline, val/test share the
# multi-scale test pipeline.
data = dict(
    samples_per_gpu=2,
    train=dict(pipeline=train_pipeline),
    val=dict(pipeline=test_pipeline),
    test=dict(pipeline=test_pipeline))
runner = dict(type='IterBasedRunner')
# Checkpoint every 1000 iterations, keeping only the most recent one.
checkpoint_config = dict(by_epoch=False, interval=1000, max_keep_ckpts=1)
# Evaluate mIoU every 4000 iterations and keep the best-scoring checkpoint.
evaluation = dict(interval=4000, metric='mIoU', save_best='mIoU')
149 changes: 149 additions & 0 deletions
149
segmentation/configs/coco_stuff10k/mask2former_beit_adapter_large_512_40k_cocostuff10k_ss.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,149 @@ | ||
# Copyright (c) Shanghai AI Lab. All rights reserved. | ||
# Mask2Former + BEiT-Adapter-L on COCO-Stuff-10K, 512x512 crops, 40k iterations,
# single-scale testing.
#
# Base configs this file builds on (model, dataset, runtime and schedule).
_base_ = [
    '../_base_/models/mask2former_beit_cocostuff.py',
    '../_base_/datasets/coco-stuff10k.py',
    '../_base_/default_runtime.py',
    '../_base_/schedules/schedule_40k.py'
]

# Shared by RandomCrop / Pad in training and by sliding-window inference.
crop_size = (512, 512)

# Local path to the BEiT-L checkpoint. The weights can be downloaded from:
# https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22k.pth
pretrained = 'pretrained/beit_large_patch16_224_pt22k_ft22k.pth'

model = dict(
    pretrained=pretrained,
    backbone=dict(
        type='BEiTAdapter',
        img_size=512,
        patch_size=16,
        embed_dim=1024,
        depth=24,
        num_heads=16,
        mlp_ratio=4,
        qkv_bias=True,
        use_abs_pos_emb=False,
        use_rel_pos_bias=True,
        init_values=1e-6,
        drop_path_rate=0.3,
        conv_inplane=64,
        n_points=4,
        deform_num_heads=16,
        cffn_ratio=0.25,
        deform_ratio=0.5,
        # Four [start, end] index pairs that together cover blocks 0..23
        # (depth=24).
        interaction_indexes=[[0, 5], [6, 11], [12, 17], [18, 23]],
    ),
    decode_head=dict(
        # One entry per backbone output level, each matching embed_dim.
        in_channels=[1024, 1024, 1024, 1024],
        feat_channels=1024,
        out_channels=1024,
        num_queries=100,
        pixel_decoder=dict(
            type='MSDeformAttnPixelDecoder',
            num_outs=3,
            norm_cfg=dict(type='GN', num_groups=32),
            act_cfg=dict(type='ReLU'),
            encoder=dict(
                type='DetrTransformerEncoder',
                num_layers=6,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=dict(
                        type='MultiScaleDeformableAttention',
                        embed_dims=1024,
                        num_heads=32,
                        num_levels=3,
                        num_points=4,
                        im2col_step=64,
                        dropout=0.0,
                        batch_first=False,
                        norm_cfg=None,
                        init_cfg=None),
                    ffn_cfgs=dict(
                        type='FFN',
                        embed_dims=1024,
                        feedforward_channels=4096,
                        num_fcs=2,
                        ffn_drop=0.0,
                        act_cfg=dict(type='ReLU', inplace=True)),
                    operation_order=('self_attn', 'norm', 'ffn', 'norm')),
                init_cfg=None),
            # num_feats=512 is half of embed_dims=1024.
            positional_encoding=dict(
                type='SinePositionalEncoding', num_feats=512, normalize=True),
            init_cfg=None),
        positional_encoding=dict(
            type='SinePositionalEncoding', num_feats=512, normalize=True),
        transformer_decoder=dict(
            type='DetrTransformerDecoder',
            return_intermediate=True,
            num_layers=9,
            transformerlayers=dict(
                type='DetrTransformerDecoderLayer',
                attn_cfgs=dict(
                    type='MultiheadAttention',
                    embed_dims=1024,
                    num_heads=32,
                    attn_drop=0.0,
                    proj_drop=0.0,
                    dropout_layer=None,
                    batch_first=False),
                ffn_cfgs=dict(
                    embed_dims=1024,
                    feedforward_channels=4096,
                    num_fcs=2,
                    act_cfg=dict(type='ReLU', inplace=True),
                    ffn_drop=0.0,
                    dropout_layer=None,
                    add_identity=True),
                feedforward_channels=4096,
                operation_order=('cross_attn', 'norm', 'self_attn', 'norm',
                                 'ffn', 'norm')),
            init_cfg=None)
    ),
    # Sliding-window inference: 512x512 windows moved with a 341-pixel stride,
    # so neighbouring windows overlap by 171 pixels.
    test_cfg=dict(mode='slide', crop_size=crop_size, stride=(341, 341))
)

# dataset settings
# Per-channel normalisation statistics shared by the train and test pipelines.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', reduce_zero_label=True),
    dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize', **img_norm_cfg),
    # Images are padded with 0, segmentation maps with 255.
    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
    # 'gt_masks' and 'gt_labels' are collected below in addition to the
    # semantic map; presumably ToMask is the step that produces them.
    dict(type='ToMask'),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect',
         keys=['img', 'gt_semantic_seg', 'gt_masks', 'gt_labels'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2048, 512),
        # Single-scale testing: img_ratios is deliberately left unset and
        # flipping is off.
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='ResizeToMultiple', size_divisor=32),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]

# `_delete_=True` replaces the optimizer inherited from the base schedule
# instead of merging into it. num_layers matches the backbone depth (24).
optimizer = dict(
    _delete_=True, type='AdamW', lr=2e-5, betas=(0.9, 0.999),
    weight_decay=0.05,
    constructor='LayerDecayOptimizerConstructor',
    paramwise_cfg=dict(num_layers=24, layer_decay_rate=0.90))

# Poly schedule with power=1.0 down to min_lr=0, after a 1500-iteration
# linear warmup; also replaces the inherited lr_config.
lr_config = dict(
    _delete_=True,
    policy='poly',
    warmup='linear',
    warmup_iters=1500,
    warmup_ratio=1e-6,
    power=1.0, min_lr=0.0, by_epoch=False)

# Two samples per GPU; train uses the training pipeline, val/test share the
# single-scale test pipeline.
data = dict(
    samples_per_gpu=2,
    train=dict(pipeline=train_pipeline),
    val=dict(pipeline=test_pipeline),
    test=dict(pipeline=test_pipeline))
runner = dict(type='IterBasedRunner')
# Checkpoint every 1000 iterations, keeping only the most recent one.
checkpoint_config = dict(by_epoch=False, interval=1000, max_keep_ckpts=1)
# Evaluate mIoU every 4000 iterations and keep the best-scoring checkpoint.
evaluation = dict(interval=4000, metric='mIoU', save_best='mIoU')
Oops, something went wrong.