# Copyright (c) 2026 The Qwen team, Alibaba Group.
# Licensed under The MIT License [see LICENSE for details]

import torch
import tilelang


def profile(func, inputs, wait: int = 50, warmup: int = 50, rep: int = 100):
    """Profile ``func(*inputs)`` and return per-event device timings.

    The callable is first run ``wait + warmup + rep`` times under
    ``torch.profiler`` (CPU and CUDA activities), stepping the profiler
    after every call. It is then timed once more as a whole with
    ``tilelang.profiler.do_bench``.

    Args:
        func: Callable to profile; invoked as ``func(*inputs)``.
        inputs: Iterable of positional arguments unpacked into ``func``.
        wait: Number of profiler steps passed as the schedule's ``wait``.
        warmup: Number of profiler steps passed as the schedule's
            ``warmup``; also forwarded to ``do_bench`` as ``warmup``.
        rep: Number of profiler steps passed as the schedule's ``active``;
            also forwarded to ``do_bench`` as ``rep``.

    Returns:
        A dict mapping each profiler event key to its averaged
        ``device_time`` scaled by ``1e-3``, plus a ``"total"`` entry
        holding the value returned by ``do_bench``.
    """
    traced_activities = [
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ]
    step_schedule = torch.profiler.schedule(wait=wait, warmup=warmup, active=rep)

    # To export a TensorBoard trace while debugging, additionally pass
    # on_trace_ready=torch.profiler.tensorboard_trace_handler('./tb').
    with torch.profiler.profile(
        activities=traced_activities,
        schedule=step_schedule,
    ) as prof:
        # One profiler step per invocation so the schedule advances
        # through all of its wait / warmup / active steps.
        for _ in range(wait + warmup + rep):
            func(*inputs)
            prof.step()

    # For a quick look at the hottest ops while debugging:
    # print(prof.key_averages().table(sort_by="cpu_time", row_limit=10))

    result = {}
    for event in prof.key_averages():
        # NOTE(review): the 1e-3 factor presumably converts microseconds to
        # milliseconds -- confirm against the installed torch version.
        result[event.key] = event.device_time * 1e-3

    # NOTE(review): assumed to be in the same unit as the per-event
    # entries above -- confirm against tilelang's do_bench.
    result["total"] = tilelang.profiler.do_bench(
        lambda: func(*inputs), warmup=warmup, rep=rep
    )
    return result