调试安卓 gles性能瓶颈

目录

原文请见:参考地址

使用mali offline shader compiler分析shader的性能瓶颈。

下载Arm Performance Studio

下载地址

编译Unity Shader


通常选择GLES3x。

You might need to select GLES3x, as this is the graphics API Mali works well with.

编译之后得到一个.shader文件,搜索#ifdef,可以分别把Vert和Frag下的代码粘贴到单独的文件中。

代码示例如下:

Vertex Shader: shader.vert

glsl 复制代码
//#ifdef VERTEX
#version 300 es
#define HLSLCC_ENABLE_UNIFORM_BUFFERS 1
#if HLSLCC_ENABLE_UNIFORM_BUFFERS
#define UNITY_UNIFORM
#else
#define UNITY_UNIFORM uniform
#endif
#define UNITY_SUPPORTS_UNIFORM_LOCATION 1
#if UNITY_SUPPORTS_UNIFORM_LOCATION
#define UNITY_LOCATION(x) layout(location = x)
#define UNITY_BINDING(x) layout(binding = x, std140)
#else
#define UNITY_LOCATION(x)
#define UNITY_BINDING(x) layout(std140)
#endif
uniform vec3 _WorldSpaceCameraPos;
uniform mediump vec4 unity_SHBr;
uniform mediump vec4 unity_SHBg;
uniform mediump vec4 unity_SHBb;
uniform mediump vec4 unity_SHC;
uniform vec4 hlslcc_mtx4x4unity_ObjectToWorld[4];
uniform vec4 hlslcc_mtx4x4unity_WorldToObject[4];
uniform vec4 hlslcc_mtx4x4unity_MatrixVP[4];
uniform vec4 _MainTex_ST;
uniform vec4 _DetailAlbedoMap_ST;
uniform mediump float _UVSec;
in highp vec4 in_POSITION0;
in mediump vec3 in_NORMAL0;
in highp vec2 in_TEXCOORD0;
in highp vec2 in_TEXCOORD1;
out highp vec4 vs_TEXCOORD0;
out highp vec4 vs_TEXCOORD1;
out highp vec4 vs_TEXCOORD2;
out highp vec4 vs_TEXCOORD3;
out highp vec4 vs_TEXCOORD4;
out mediump vec4 vs_TEXCOORD5;
out highp vec4 vs_TEXCOORD7;
out highp vec3 vs_TEXCOORD8;
vec4 u_xlat0;
mediump vec4 u_xlat16_0;
bool u_xlatb0;
vec4 u_xlat1;
mediump float u_xlat16_2;
mediump vec3 u_xlat16_3;
float u_xlat12;
void main()
{
u_xlat0 = in_POSITION0.yyyy * hlslcc_mtx4x4unity_ObjectToWorld[1];
u_xlat0 = hlslcc_mtx4x4unity_ObjectToWorld[0] * in_POSITION0.xxxx + u_xlat0;
u_xlat0 = hlslcc_mtx4x4unity_ObjectToWorld[2] * in_POSITION0.zzzz + u_xlat0;
u_xlat0 = u_xlat0 + hlslcc_mtx4x4unity_ObjectToWorld[3];
u_xlat1 = u_xlat0.yyyy * hlslcc_mtx4x4unity_MatrixVP[1];
u_xlat1 = hlslcc_mtx4x4unity_MatrixVP[0] * u_xlat0.xxxx + u_xlat1;
u_xlat1 = hlslcc_mtx4x4unity_MatrixVP[2] * u_xlat0.zzzz + u_xlat1;
gl_Position = hlslcc_mtx4x4unity_MatrixVP[3] * u_xlat0.wwww + u_xlat1;
#ifdef UNITY_ADRENO_ES3
u_xlatb0 = !!(_UVSec==0.0);
#else
u_xlatb0 = _UVSec==0.0;
#endif
u_xlat0.xy = (bool(u_xlatb0)) ? in_TEXCOORD0.xy : in_TEXCOORD1.xy;
vs_TEXCOORD0.zw = u_xlat0.xy * _DetailAlbedoMap_ST.xy + _DetailAlbedoMap_ST.zw;
vs_TEXCOORD0.xy = in_TEXCOORD0.xy * _MainTex_ST.xy + _MainTex_ST.zw;
u_xlat0.xyz = in_POSITION0.yyy * hlslcc_mtx4x4unity_ObjectToWorld[1].xyz;
u_xlat0.xyz = hlslcc_mtx4x4unity_ObjectToWorld[0].xyz * in_POSITION0.xxx + u_xlat0.xyz;
u_xlat0.xyz = hlslcc_mtx4x4unity_ObjectToWorld[2].xyz * in_POSITION0.zzz + u_xlat0.xyz;
u_xlat0.xyz = hlslcc_mtx4x4unity_ObjectToWorld[3].xyz * in_POSITION0.www + u_xlat0.xyz;
vs_TEXCOORD1.xyz = u_xlat0.xyz + (-_WorldSpaceCameraPos.xyz);
vs_TEXCOORD8.xyz = u_xlat0.xyz;
vs_TEXCOORD1.w = 0.0;
vs_TEXCOORD2 = vec4(0.0, 0.0, 0.0, 0.0);
vs_TEXCOORD3 = vec4(0.0, 0.0, 0.0, 0.0);
u_xlat0.x = dot(in_NORMAL0.xyz, hlslcc_mtx4x4unity_WorldToObject[0].xyz);
u_xlat0.y = dot(in_NORMAL0.xyz, hlslcc_mtx4x4unity_WorldToObject[1].xyz);
u_xlat0.z = dot(in_NORMAL0.xyz, hlslcc_mtx4x4unity_WorldToObject[2].xyz);
u_xlat12 = dot(u_xlat0.xyz, u_xlat0.xyz);
u_xlat12 = inversesqrt(u_xlat12);
u_xlat0.xyz = vec3(u_xlat12) * u_xlat0.xyz;
vs_TEXCOORD4.xyz = u_xlat0.xyz;
vs_TEXCOORD4.w = 0.0;
u_xlat16_2 = u_xlat0.y * u_xlat0.y;
u_xlat16_2 = u_xlat0.x * u_xlat0.x + (-u_xlat16_2);
u_xlat16_0 = u_xlat0.yzzx * u_xlat0.xyzz;
u_xlat16_3.x = dot(unity_SHBr, u_xlat16_0);
u_xlat16_3.y = dot(unity_SHBg, u_xlat16_0);
u_xlat16_3.z = dot(unity_SHBb, u_xlat16_0);
vs_TEXCOORD5.xyz = unity_SHC.xyz * vec3(u_xlat16_2) + u_xlat16_3.xyz;
vs_TEXCOORD5.w = 0.0;
vs_TEXCOORD7 = vec4(0.0, 0.0, 0.0, 0.0);
return;
}
//#endif

运行malios调试

进入到malios.exe 对应的目录,然后在cmd中可以分析瓶颈

示例如下,

复制代码
`C:\Users\rtorresb\Desktop\Tmp>malioc shader.vert
Mali Offline Compiler v7.1.0 (Build 7a3538)
Copyright 2007-2020 Arm Limited, all rights reserved
Configuration
=============
Hardware: Mali-G76 r0p0
Driver: Bifrost r19p0-00rel0
Shader type: OpenGL ES Vertex (inferred)
Main shader
===========
Work registers: 32
Uniform registers: 82
Stack spilling: False
A LS V T Bound
Total instruction cycles: 2.9 16.0 0.0 0.0 LS
Shortest path cycles: 2.9 16.0 0.0 0.0 LS
Longest path cycles: 2.9 16.0 0.0 0.0 LS
A = Arithmetic, LS = Load/Store, V = Varying, T = Texture

这个vert shader 计算的时钟周期是2.9,load/store的时钟周期是16.0 瓶颈是LS

fragment shader调试同理

用处和限制

Here are a few key lessons you can get from this post:

复制代码
Everything counts towards performance: instructions, texture channels, variants. Everything.
You have a neat tool to measure the cost of your shaders
And more importantly, you can now compare shaders' performance when in doubt
You are now a step closer to 60 FPS.

However, keep in mind:

复制代码
These estimates greatly vary across architectures and even driver versions...
Yet, these metrics will be incredibly useful for your optimization journey

比较重要的一点是该工具只支持基于mali架构的处理器,而且只是粗略估计,线上的真是环境比如缓存什么的,它就无法模拟。

相关推荐
阿巴斯甜14 分钟前
Android 报错:Zip file '/Users/lyy/develop/repoAndroidLapp/l-app-android-ble/app/bu
android
Kapaseker1 小时前
实战 Compose 中的 IntrinsicSize
android·kotlin
xq95272 小时前
Andorid Google 登录接入文档
android
黄林晴3 小时前
告别 Modifier 地狱,Compose 样式系统要变天了
android·android jetpack
冬奇Lab15 小时前
Android触摸事件分发、手势识别与输入优化实战
android·源码阅读
城东米粉儿18 小时前
Android MediaPlayer 笔记
android
Jony_19 小时前
Android 启动优化方案
android
阿巴斯甜19 小时前
Android studio 报错:Cause: error=86, Bad CPU type in executable
android
张小潇19 小时前
AOSP15 Input专题InputReader源码分析
android
_小马快跑_1 天前
Kotlin | 协程调度器选择:何时用CoroutineScope配置,何时用launch指定?
android