CPU profiling 也有类似边界。perf_sample、cpu_profile_stack_sample 是采样事实,不是执行全量记录;采样频率、采样窗口、unwind、symbolization 和 lost sample 统计都会影响结论。没有采样表,只能写“未采集 CPU profile”,不能写“没有 CPU 热点”。
-- Availability check: row counts of the actual and expected FrameTimeline
-- tables, returned as one row per table.
SELECT
  'actual' AS table_name,
  COUNT(*) AS row_count
FROM actual_frame_timeline_slice
UNION ALL
SELECT
  'expected' AS table_name,
  COUNT(*) AS row_count
FROM expected_frame_timeline_slice;
INCLUDE PERFETTO MODULE slices.with_context; INCLUDE PERFETTO MODULE time.conversion;
-- Candidate analysis windows taken from the app's own 'feed_scroll' slices.
-- Returns the first 10 by start time, with raw timestamps alongside
-- millisecond conversions. Zero/negative-duration slices are skipped.
SELECT
  'app_marker' AS window_source,
  name AS window_name,
  ts AS start_ts,
  ts + dur AS end_ts,
  time_to_ms(ts) AS start_ms,
  time_to_ms(ts + dur) AS end_ms,
  time_to_ms(dur) AS duration_ms,
  process_name,
  thread_name
FROM thread_slice
WHERE process_name = 'com.example.app'
  AND name = 'feed_scroll'
  AND dur > 0
ORDER BY ts
LIMIT 10;
-- Candidate analysis windows taken from FrameTimeline: the 10 longest
-- actual-frame slices, with raw timestamps alongside millisecond conversions.
-- Zero/negative-duration slices are skipped.
SELECT
  'frame_timeline' AS window_source,
  name AS window_name,
  ts AS start_ts,
  ts + dur AS end_ts,
  time_to_ms(ts) AS start_ms,
  time_to_ms(ts + dur) AS end_ms,
  time_to_ms(dur) AS duration_ms
FROM actual_frame_timeline_slice
WHERE dur > 0
ORDER BY dur DESC
LIMIT 10;
-- Resolve the target process and its threads.
-- candidate_processes: one row per (name, upid, pid) whose process name
-- matches GLOB 'com.example.app*', with its thread count, whether any thread
-- is flagged as the main thread, and the earliest thread start time in ms.
WITH candidate_processes AS (
  SELECT
    p.name AS process_name,
    p.upid,
    p.pid,
    COUNT(th.utid) AS thread_count,
    MAX(th.is_main_thread) AS has_main_thread,
    time_to_ms(MIN(th.start_ts)) AS first_thread_start_ms
  FROM process p
  JOIN thread th USING (upid)
  WHERE p.name GLOB 'com.example.app*'
  GROUP BY p.name, p.upid, p.pid
)
-- One output row per thread of each candidate process; the main thread sorts
-- first within its process.
SELECT
  cp.process_name,
  cp.upid,
  cp.pid,
  cp.thread_count,
  cp.has_main_thread,
  cp.first_thread_start_ms,
  th.name AS thread_name,
  th.utid,
  th.tid,
  th.is_main_thread
FROM thread th
JOIN candidate_processes cp USING (upid)
ORDER BY cp.process_name, th.is_main_thread DESC, th.name;
INCLUDE PERFETTO MODULE slices.with_context; INCLUDE PERFETTO MODULE time.conversion;
-- Slices on the app's main thread or RenderThread that overlap the target
-- window by more than 5 ms, longest overlap first.
-- The window bounds are literal example values; replace them with the window
-- under analysis.
WITH target_window(start_ts, end_ts) AS (
  VALUES (123000000000, 123500000000)
),
target_slices AS (
  SELECT
    s.ts,
    s.dur,
    s.upid,
    s.utid,
    s.name,
    s.thread_name,
    s.process_name,
    -- Clip each slice to the window so only time inside it is counted.
    MAX(0, MIN(s.ts + s.dur, w.end_ts) - MAX(s.ts, w.start_ts)) AS overlap_dur
  FROM thread_slice s
  JOIN target_window w  -- single-row CTE: attaches the bounds to every slice
  WHERE s.process_name = 'com.example.app'
    AND (s.is_main_thread = 1 OR s.thread_name = 'RenderThread')
    AND s.dur > 0
    AND s.ts < w.end_ts
    AND s.ts + s.dur > w.start_ts
)
SELECT
  upid,
  utid,
  time_to_ms(ts) AS ts_ms,
  time_to_ms(overlap_dur) AS overlap_ms,
  time_to_ms(dur) AS original_dur_ms,
  name,
  thread_name,
  process_name
FROM target_slices
WHERE overlap_dur > time_from_ms(5)
ORDER BY overlap_ms DESC;
-- Availability check for scheduling evidence: how many thread_state and sched
-- rows overlap the target window for the app's main thread and RenderThread.
-- The window bounds are literal example values; replace them with the window
-- under analysis.
WITH target_window(start_ts, end_ts) AS (
  VALUES (123000000000, 123500000000)
),
target_threads AS (
  SELECT th.utid
  FROM thread th
  JOIN process p USING (upid)
  WHERE p.name = 'com.example.app'
    AND (th.is_main_thread = 1 OR th.name = 'RenderThread')
)
SELECT
  'thread_state_rows' AS check_name,
  COUNT(*) AS row_count
FROM thread_state ts
JOIN target_threads t USING (utid)
JOIN target_window w  -- single-row CTE: attaches the bounds to every row
WHERE ts.ts < w.end_ts
  AND ts.ts + ts.dur > w.start_ts
UNION ALL
SELECT
  'sched_rows' AS check_name,
  COUNT(*) AS row_count
FROM sched s
JOIN target_threads t USING (utid)
JOIN target_window w
WHERE s.ts < w.end_ts
  AND s.ts + s.dur > w.start_ts;
-- Time spent per thread state inside the target window, for the app's main
-- thread and RenderThread. Output is one row per
-- (process, thread name, state bucket) with the summed overlap in ms.
-- The window bounds are literal example values; replace them with the window
-- under analysis.
WITH target_window(start_ts, end_ts) AS (
  VALUES (123000000000, 123500000000)
),
target_threads AS (
  SELECT
    th.utid,
    p.name AS process_name,
    th.name AS thread_name
  FROM thread th
  JOIN process p USING (upid)
  WHERE p.name = 'com.example.app'
    AND (th.is_main_thread = 1 OR th.name = 'RenderThread')
),
state_in_window AS (
  SELECT
    t.process_name,
    t.thread_name,
    -- 'Running' is tested first, so the GLOB 'R*' branch only receives the
    -- remaining states that start with 'R'.
    CASE
      WHEN ts.state = 'Running' THEN 'running'
      WHEN ts.state GLOB 'R*' THEN 'runnable'
      WHEN ts.state = 'S' THEN 'interruptible_sleep'
      WHEN ts.state = 'D' THEN 'uninterruptible_blocked'
      ELSE 'other_' || ts.state
    END AS state_bucket,
    -- Clip each state interval to the window so only time inside it counts.
    MAX(0, MIN(ts.ts + ts.dur, w.end_ts) - MAX(ts.ts, w.start_ts)) AS overlap_dur
  FROM thread_state ts
  JOIN target_threads t USING (utid)
  JOIN target_window w  -- single-row CTE: attaches the bounds to every row
  WHERE ts.dur > 0
    AND ts.ts < w.end_ts
    AND ts.ts + ts.dur > w.start_ts
)
SELECT
  process_name,
  thread_name,
  state_bucket,
  time_to_ms(SUM(overlap_dur)) AS overlap_ms
FROM state_in_window
GROUP BY process_name, thread_name, state_bucket
ORDER BY thread_name, overlap_ms DESC;
S 常见于正常 sleep 或 futex 等待,D 更接近不可中断等待;需要定责时再结合 thread_state 的 blocked reason、io_wait、waker 字段和 schema 版本确认。点击响应、滑动首帧、启动首帧这类场景里,长 Runnable 只能说明可能存在 CPU 竞争或调度延迟,需要和其他证据一起看。它还不能单独定责:线程 running time 是线程维度,CPU busy 和 cluster 解释要另查 sched.cpu、CPU idle、CPU frequency、capacity,并限制在同一窗口内。
from perfetto.trace_processor import TraceProcessor
# Process name of the app under analysis.
# NOTE(review): presumably the value substituted for {process_name} in the
# query templates — confirm at the call site, which is not visible here.
APP_PROCESS = "com.example.app"
# Trace-quality check: selects non-zero rows from the `stats` table whose
# severity is 'error' or 'data_loss', or whose name contains "loss", "drop"
# or "parse" (case-insensitive), ordered by severity, source, name, idx.
QUALITY_DETAIL_QUERY = """ SELECT name, idx, severity, source, value FROM stats WHERE value != 0 AND ( severity IN ('error', 'data_loss') OR LOWER(name) LIKE '%loss%' OR LOWER(name) LIKE '%drop%' OR LOWER(name) LIKE '%parse%' ) ORDER BY severity, source, name, idx; """
# FrameTimeline availability check: returns a single row holding the row
# counts of the actual and expected FrameTimeline tables.
FRAME_TIMELINE_QUERY = """ SELECT (SELECT COUNT(*) FROM actual_frame_timeline_slice) AS actual_count, (SELECT COUNT(*) FROM expected_frame_timeline_slice) AS expected_count; """
# Query template: top 20 slice names (by total in-window time) on the target
# process's main thread or RenderThread, counting only slices whose overlap
# with the window exceeds 5 ms. Each slice is clipped to the window before
# aggregation.
# Placeholders: {start_ts}, {end_ts} (window bounds) and {process_name}.
# NOTE(review): {process_name} is not wrapped in quotes inside the template,
# so the substituted value must already be a quoted (and escaped) SQL string
# literal — confirm how the caller fills it in.
SLOW_SLICE_QUERY_TEMPLATE = """ INCLUDE PERFETTO MODULE slices.with_context; INCLUDE PERFETTO MODULE time.conversion; WITH target_window(start_ts, end_ts) AS ( VALUES ({start_ts}, {end_ts}) ), target_slices AS ( SELECT s.process_name, s.upid, s.thread_name, s.utid, s.name AS slice_name, MAX(0, MIN(s.ts + s.dur, w.end_ts) - MAX(s.ts, w.start_ts)) AS overlap_dur FROM thread_slice s JOIN target_window w WHERE s.process_name = {process_name} AND (s.is_main_thread = 1 OR s.thread_name = 'RenderThread') AND s.dur > 0 AND s.ts < w.end_ts AND s.ts + s.dur > w.start_ts ) SELECT process_name, upid, thread_name, utid, slice_name, COUNT(*) AS slice_count, time_to_ms(SUM(overlap_dur)) AS overlap_total_ms, time_to_ms(MAX(overlap_dur)) AS max_overlap_ms FROM target_slices WHERE overlap_dur > time_from_ms(5) GROUP BY process_name, upid, thread_name, utid, slice_name ORDER BY overlap_total_ms DESC LIMIT 20; """
# Query template: in-window time per thread-state bucket for the target
# process's main thread and RenderThread. Each thread_state interval is
# clipped to the window, then summed per (process, thread, utid, bucket).
# Bucket labels: running_ms, runnable_ms, sleeping_ms,
# uninterruptible_blocked_ms, other_state_ms.
# Placeholders: {start_ts}, {end_ts} (window bounds) and {process_name}.
# NOTE(review): {process_name} is not wrapped in quotes inside the template,
# so the substituted value must already be a quoted (and escaped) SQL string
# literal — confirm how the caller fills it in.
STATE_QUERY_TEMPLATE = """ INCLUDE PERFETTO MODULE time.conversion; WITH target_window(start_ts, end_ts) AS ( VALUES ({start_ts}, {end_ts}) ), target_threads AS ( SELECT th.utid, p.name AS process_name, th.name AS thread_name FROM thread th JOIN process p USING (upid) WHERE p.name = {process_name} AND (th.is_main_thread = 1 OR th.name = 'RenderThread') ), state_in_window AS ( SELECT t.process_name, t.thread_name, ts.utid, CASE WHEN ts.state = 'Running' THEN 'running_ms' WHEN ts.state GLOB 'R*' THEN 'runnable_ms' WHEN ts.state = 'S' THEN 'sleeping_ms' WHEN ts.state = 'D' THEN 'uninterruptible_blocked_ms' ELSE 'other_state_ms' END AS state_bucket, MAX(0, MIN(ts.ts + ts.dur, w.end_ts) - MAX(ts.ts, w.start_ts)) AS overlap_dur FROM thread_state ts JOIN target_threads t USING (utid) JOIN target_window w WHERE ts.dur > 0 AND ts.ts < w.end_ts AND ts.ts + ts.dur > w.start_ts ) SELECT process_name, thread_name, utid, state_bucket, time_to_ms(SUM(overlap_dur)) AS overlap_ms FROM state_in_window GROUP BY process_name, thread_name, utid, state_bucket; """
import glob

from perfetto.batch_trace_processor.api import BatchTraceProcessor

# Query run against every trace: top 20 slice names (by total in-window time)
# on the app's main thread or RenderThread, counting only slices whose overlap
# with the fixed example window exceeds 5 ms.
QUERY = """ INCLUDE PERFETTO MODULE slices.with_context; INCLUDE PERFETTO MODULE time.conversion; WITH target_window(start_ts, end_ts) AS ( VALUES (123000000000, 123500000000) ), target_slices AS ( SELECT process_name, upid, thread_name, utid, name AS slice_name, MAX(0, MIN(ts + dur, w.end_ts) - MAX(ts, w.start_ts)) AS overlap_dur FROM thread_slice JOIN target_window w WHERE process_name = 'com.example.app' AND (is_main_thread = 1 OR thread_name = 'RenderThread') AND ts < w.end_ts AND ts + dur > w.start_ts ) SELECT process_name, upid, thread_name, utid, slice_name, COUNT(*) AS slice_count, time_to_ms(SUM(overlap_dur)) AS overlap_total_ms FROM target_slices WHERE overlap_dur > time_from_ms(5) GROUP BY process_name, upid, thread_name, utid, slice_name ORDER BY overlap_total_ms DESC LIMIT 20; """

# glob returns matches in arbitrary order; sort so the list of traces handed
# to the batch processor is the same on every run.
files = sorted(glob.glob("traces/*.perfetto-trace"))

# The context manager releases the per-trace processors once the query is done.
with BatchTraceProcessor(files) as btp:
    result = btp.query_and_flatten(QUERY)
    print(result)
还有一条容易被忽略:指标要写出失败时的解释口径。比如“5ms 以上主线程 slice 总耗时”只能回答切片里有多少长任务,它不能独立证明卡顿;“Runnable 总耗时”只能说明线程有等待 CPU 的迹象,它也不能替代 CPU 频率、负载和调度器证据。FrameTimeline 缺失时,报告应写 metric_status=unavailable、fallback_used=false、degrade_reason=missing_frame_timeline;ftrace loss 命中时,调度类指标应写 sched_evidence_grade=weak。
同一目标窗口里的 CPU frequency、CPU idle、thermal、cluster/CPU placement 更适合放在报告的“解释变量”里。它们通常不单独定责,但能解释为什么同样的业务切片在某次 Trace 里变慢。
多 Trace 才能回答回归问题
单份 Trace 适合定位一个现场,多份 Trace 才能回答“优化有没有稳定变好”。很多性能报告的问题不在 Trace 本身,而在证据组织:截图很多,结论下得重,但没有固定配置、没有样本数、没有 data loss 检查、没有 before/after 统计口径。