调试安卓 gles性能瓶颈

目录

原文请见:参考地址

使用mali offline shader compiler分析shader的性能瓶颈。

下载Arm Performance Studio

下载地址

编译Unity Shader


通常选择GLES3x。

You might need to select GLES3x, as this is the graphics API Mali works well with.

编译之后得到一个.shader文件,搜索#ifdef,可以分别把Vert和Frag下的代码粘贴到单独的文件中。

代码示例如下:

Vertex Shader: shader.vert

glsl 复制代码
//#ifdef VERTEX
#version 300 es
#define HLSLCC_ENABLE_UNIFORM_BUFFERS 1
#if HLSLCC_ENABLE_UNIFORM_BUFFERS
#define UNITY_UNIFORM
#else
#define UNITY_UNIFORM uniform
#endif
#define UNITY_SUPPORTS_UNIFORM_LOCATION 1
#if UNITY_SUPPORTS_UNIFORM_LOCATION
#define UNITY_LOCATION(x) layout(location = x)
#define UNITY_BINDING(x) layout(binding = x, std140)
#else
#define UNITY_LOCATION(x)
#define UNITY_BINDING(x) layout(std140)
#endif
uniform vec3 _WorldSpaceCameraPos;
uniform mediump vec4 unity_SHBr;
uniform mediump vec4 unity_SHBg;
uniform mediump vec4 unity_SHBb;
uniform mediump vec4 unity_SHC;
uniform vec4 hlslcc_mtx4x4unity_ObjectToWorld[4];
uniform vec4 hlslcc_mtx4x4unity_WorldToObject[4];
uniform vec4 hlslcc_mtx4x4unity_MatrixVP[4];
uniform vec4 _MainTex_ST;
uniform vec4 _DetailAlbedoMap_ST;
uniform mediump float _UVSec;
in highp vec4 in_POSITION0;
in mediump vec3 in_NORMAL0;
in highp vec2 in_TEXCOORD0;
in highp vec2 in_TEXCOORD1;
out highp vec4 vs_TEXCOORD0;
out highp vec4 vs_TEXCOORD1;
out highp vec4 vs_TEXCOORD2;
out highp vec4 vs_TEXCOORD3;
out highp vec4 vs_TEXCOORD4;
out mediump vec4 vs_TEXCOORD5;
out highp vec4 vs_TEXCOORD7;
out highp vec3 vs_TEXCOORD8;
vec4 u_xlat0;
mediump vec4 u_xlat16_0;
bool u_xlatb0;
vec4 u_xlat1;
mediump float u_xlat16_2;
mediump vec3 u_xlat16_3;
float u_xlat12;
void main()
{
u_xlat0 = in_POSITION0.yyyy * hlslcc_mtx4x4unity_ObjectToWorld[1];
u_xlat0 = hlslcc_mtx4x4unity_ObjectToWorld[0] * in_POSITION0.xxxx + u_xlat0;
u_xlat0 = hlslcc_mtx4x4unity_ObjectToWorld[2] * in_POSITION0.zzzz + u_xlat0;
u_xlat0 = u_xlat0 + hlslcc_mtx4x4unity_ObjectToWorld[3];
u_xlat1 = u_xlat0.yyyy * hlslcc_mtx4x4unity_MatrixVP[1];
u_xlat1 = hlslcc_mtx4x4unity_MatrixVP[0] * u_xlat0.xxxx + u_xlat1;
u_xlat1 = hlslcc_mtx4x4unity_MatrixVP[2] * u_xlat0.zzzz + u_xlat1;
gl_Position = hlslcc_mtx4x4unity_MatrixVP[3] * u_xlat0.wwww + u_xlat1;
#ifdef UNITY_ADRENO_ES3
u_xlatb0 = !!(_UVSec==0.0);
#else
u_xlatb0 = _UVSec==0.0;
#endif
u_xlat0.xy = (bool(u_xlatb0)) ? in_TEXCOORD0.xy : in_TEXCOORD1.xy;
vs_TEXCOORD0.zw = u_xlat0.xy * _DetailAlbedoMap_ST.xy + _DetailAlbedoMap_ST.zw;
vs_TEXCOORD0.xy = in_TEXCOORD0.xy * _MainTex_ST.xy + _MainTex_ST.zw;
u_xlat0.xyz = in_POSITION0.yyy * hlslcc_mtx4x4unity_ObjectToWorld[1].xyz;
u_xlat0.xyz = hlslcc_mtx4x4unity_ObjectToWorld[0].xyz * in_POSITION0.xxx + u_xlat0.xyz;
u_xlat0.xyz = hlslcc_mtx4x4unity_ObjectToWorld[2].xyz * in_POSITION0.zzz + u_xlat0.xyz;
u_xlat0.xyz = hlslcc_mtx4x4unity_ObjectToWorld[3].xyz * in_POSITION0.www + u_xlat0.xyz;
vs_TEXCOORD1.xyz = u_xlat0.xyz + (-_WorldSpaceCameraPos.xyz);
vs_TEXCOORD8.xyz = u_xlat0.xyz;
vs_TEXCOORD1.w = 0.0;
vs_TEXCOORD2 = vec4(0.0, 0.0, 0.0, 0.0);
vs_TEXCOORD3 = vec4(0.0, 0.0, 0.0, 0.0);
u_xlat0.x = dot(in_NORMAL0.xyz, hlslcc_mtx4x4unity_WorldToObject[0].xyz);
u_xlat0.y = dot(in_NORMAL0.xyz, hlslcc_mtx4x4unity_WorldToObject[1].xyz);
u_xlat0.z = dot(in_NORMAL0.xyz, hlslcc_mtx4x4unity_WorldToObject[2].xyz);
u_xlat12 = dot(u_xlat0.xyz, u_xlat0.xyz);
u_xlat12 = inversesqrt(u_xlat12);
u_xlat0.xyz = vec3(u_xlat12) * u_xlat0.xyz;
vs_TEXCOORD4.xyz = u_xlat0.xyz;
vs_TEXCOORD4.w = 0.0;
u_xlat16_2 = u_xlat0.y * u_xlat0.y;
u_xlat16_2 = u_xlat0.x * u_xlat0.x + (-u_xlat16_2);
u_xlat16_0 = u_xlat0.yzzx * u_xlat0.xyzz;
u_xlat16_3.x = dot(unity_SHBr, u_xlat16_0);
u_xlat16_3.y = dot(unity_SHBg, u_xlat16_0);
u_xlat16_3.z = dot(unity_SHBb, u_xlat16_0);
vs_TEXCOORD5.xyz = unity_SHC.xyz * vec3(u_xlat16_2) + u_xlat16_3.xyz;
vs_TEXCOORD5.w = 0.0;
vs_TEXCOORD7 = vec4(0.0, 0.0, 0.0, 0.0);
return;
}
//#endif

运行malios调试

进入到malios.exe 对应的目录,然后在cmd中可以分析瓶颈

示例如下,

`C:\Users\rtorresb\Desktop\Tmp>malioc shader.vert
Mali Offline Compiler v7.1.0 (Build 7a3538)
Copyright 2007-2020 Arm Limited, all rights reserved
Configuration
=============
Hardware: Mali-G76 r0p0
Driver: Bifrost r19p0-00rel0
Shader type: OpenGL ES Vertex (inferred)
Main shader
===========
Work registers: 32
Uniform registers: 82
Stack spilling: False
A LS V T Bound
Total instruction cycles: 2.9 16.0 0.0 0.0 LS
Shortest path cycles: 2.9 16.0 0.0 0.0 LS
Longest path cycles: 2.9 16.0 0.0 0.0 LS
A = Arithmetic, LS = Load/Store, V = Varying, T = Texture

这个vert shader 计算的时钟周期是2.9,load/store的时钟周期是16.0 瓶颈是LS

fragment shader调试同理

用处和限制

Here are a few key lessons you can get from this post:

Everything counts towards performance: instructions, texture channels, variants. Everything.
You have a neat tool to measure the cost of your shaders
And more importantly, you can now compare shaders' performance when in doubt
You are now a step closer to 60 FPS.

However, keep in mind:

These estimates greatly vary across architectures and even driver versions...
Yet, these metrics will be incredibly useful for your optimization journey

比较重要的一点是该工具只支持基于mali架构的处理器,而且只是粗略估计,线上的真是环境比如缓存什么的,它就无法模拟。

相关推荐
H10017 分钟前
重构(二)
android·重构
拓端研究室28 分钟前
R基于贝叶斯加法回归树BART、MCMC的DLNM分布滞后非线性模型分析母婴PM2.5暴露与出生体重数据及GAM模型对比、关键窗口识别
android·开发语言·kotlin
zhangphil1 小时前
Android简洁缩放Matrix实现图像马赛克,Kotlin
android·kotlin
m0_512744641 小时前
极客大挑战2024-web-wp(详细)
android·前端
lw向北.2 小时前
Qt For Android之环境搭建(Qt 5.12.11 Qt下载SDK的处理方案)
android·开发语言·qt
不爱学习的啊Biao2 小时前
【13】MySQL如何选择合适的索引?
android·数据库·mysql
Clockwiseee2 小时前
PHP伪协议总结
android·开发语言·php
mmsx9 小时前
android sqlite 数据库简单封装示例(java)
android·java·数据库
众拾达人12 小时前
Android自动化测试实战 Java篇 主流工具 框架 脚本
android·java·开发语言
吃着火锅x唱着歌12 小时前
PHP7内核剖析 学习笔记 第四章 内存管理(1)
android·笔记·学习