1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
|
import logging
import os
from collections import OrderedDict
import torch
from torch.nn.parallel import DistributedDataParallel
import detectron2.utils.comm as comm
from detectron2.checkpoint import DetectionCheckpointer, PeriodicCheckpointer
from detectron2.config import get_cfg
from detectron2.data import (
MetadataCatalog,
build_detection_train_loader,
)
from detectron2.engine import default_argument_parser, default_setup, launch
from detectron2.modeling import build_model
from detectron2.solver import build_lr_scheduler, build_optimizer
from detectron2.utils.events import (
CommonMetricPrinter,
EventStorage,
JSONWriter,
TensorboardXWriter,
)
logger = logging.getLogger("detectron2")
def do_train(cfg, model, resume=False):
model.train()
optimizer = build_optimizer(cfg, model)
scheduler = build_lr_scheduler(cfg, optimizer)
checkpointer = DetectionCheckpointer(
model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler
)
start_iter = (
checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1
)
max_iter = cfg.SOLVER.MAX_ITER
periodic_checkpointer = PeriodicCheckpointer(
checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter
)
writers = (
[
CommonMetricPrinter(max_iter),
JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
TensorboardXWriter(cfg.OUTPUT_DIR),
]
if comm.is_main_process()
else []
)
# compared to "train_net.py", we do not support accurate timing and
# precise BN here, because they are not trivial to implement
data_loader = build_detection_train_loader(cfg)
logger.info("Starting training from iteration {}".format(start_iter))
with EventStorage(start_iter) as storage:
for data, iteration in zip(data_loader, range(start_iter, max_iter)):
iteration = iteration + 1
storage.step()
loss_dict = model(data)
losses = sum(loss_dict.values())
#print(data)
#print(loss_dict)
assert torch.isfinite(losses).all(), loss_dict
loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
losses_reduced = sum(loss for loss in loss_dict_reduced.values())
if comm.is_main_process():
storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)
optimizer.zero_grad()
losses.backward()
optimizer.step()
storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
scheduler.step()
if (
cfg.TEST.EVAL_PERIOD > 0
and iteration % cfg.TEST.EVAL_PERIOD == 0
and iteration != max_iter
):
# Compared to "train_net.py", the test results are not dumped to EventStorage
comm.synchronize()
if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter):
for writer in writers:
writer.write()
periodic_checkpointer.step(iteration)
def setup(args):
"""
Create configs and perform basic setups.
"""
cfg = get_cfg()
cfg.merge_from_file(args.config_file)
cfg.merge_from_list(args.opts)
cfg.freeze()
default_setup(
cfg, args
) # if you don't like any of the default setup, write your own setup code
return cfg
def main(args):
cfg = setup(args)
model = build_model(cfg)
logger.info("Model:\n{}".format(model))
do_train(cfg, model)
if __name__ == "__main__":
args = default_argument_parser().parse_args()
print("Command Line Args:", args)
launch(
main,
args.num_gpus,
num_machines=args.num_machines,
machine_rank=args.machine_rank,
dist_url=args.dist_url,
args=(args,),
)
|