赞
踩
原文请见:参考地址
使用mali offline shader compiler分析shader的性能瓶颈。
通常选择GLES3x。
You might need to select GLES3x, as this is the graphics API Mali works well with.
编译之后得到一个 .shader 文件，在其中搜索 #ifdef（例如 #ifdef VERTEX），可以分别把 Vert 和 Frag 下的代码粘贴到单独的文件中。
代码示例如下:
Vertex Shader: shader.vert
// shader.vert -- vertex stage extracted from a compiled Unity .shader file.
// The "hlslcc_" prefixes suggest it was machine-generated by Unity's HLSL
// cross-compiler (NOTE(review): inferred from identifier names only).
//
// Each preprocessor directive must sit on its own line and "#version" must be
// the first non-comment line, otherwise the GLSL ES compiler rejects the file.

//#ifdef VERTEX
#version 300 es

#define HLSLCC_ENABLE_UNIFORM_BUFFERS 1
#if HLSLCC_ENABLE_UNIFORM_BUFFERS
#define UNITY_UNIFORM
#else
#define UNITY_UNIFORM uniform
#endif
#define UNITY_SUPPORTS_UNIFORM_LOCATION 1
#if UNITY_SUPPORTS_UNIFORM_LOCATION
#define UNITY_LOCATION(x) layout(location = x)
#define UNITY_BINDING(x) layout(binding = x, std140)
#else
#define UNITY_LOCATION(x)
#define UNITY_BINDING(x) layout(std140)
#endif

// ---- Uniforms --------------------------------------------------------------
uniform vec3 _WorldSpaceCameraPos;
// Presumably Unity's spherical-harmonics lighting coefficients (by name).
uniform mediump vec4 unity_SHBr;
uniform mediump vec4 unity_SHBg;
uniform mediump vec4 unity_SHBb;
uniform mediump vec4 unity_SHC;
// 4x4 matrices passed as four vec4s each; main() combines them as
// M[0]*x + M[1]*y + M[2]*z + M[3]*w.
uniform vec4 hlslcc_mtx4x4unity_ObjectToWorld[4];
uniform vec4 hlslcc_mtx4x4unity_WorldToObject[4];
uniform vec4 hlslcc_mtx4x4unity_MatrixVP[4];
// UV transforms: main() uses .xy as a scale and .zw as an offset.
uniform vec4 _MainTex_ST;
uniform vec4 _DetailAlbedoMap_ST;
// Selects the UV set for the detail map: 0.0 -> in_TEXCOORD0, else in_TEXCOORD1.
uniform mediump float _UVSec;

// ---- Vertex attributes -----------------------------------------------------
in highp vec4 in_POSITION0;
in mediump vec3 in_NORMAL0;
in highp vec2 in_TEXCOORD0;
in highp vec2 in_TEXCOORD1;

// ---- Varyings --------------------------------------------------------------
out highp vec4 vs_TEXCOORD0;   // xy: main-texture UV, zw: detail-texture UV
out highp vec4 vs_TEXCOORD1;   // xyz: position minus _WorldSpaceCameraPos, w: 0
out highp vec4 vs_TEXCOORD2;   // written as all zeros
out highp vec4 vs_TEXCOORD3;   // written as all zeros
out highp vec4 vs_TEXCOORD4;   // xyz: normalized transformed normal, w: 0
out mediump vec4 vs_TEXCOORD5; // xyz: value built from unity_SHB*/unity_SHC, w: 0
out highp vec4 vs_TEXCOORD7;   // written as all zeros
out highp vec3 vs_TEXCOORD8;   // position after the ObjectToWorld transform

// ---- Compiler-generated temporaries ----------------------------------------
vec4 u_xlat0;
mediump vec4 u_xlat16_0;
bool u_xlatb0;
vec4 u_xlat1;
mediump float u_xlat16_2;
mediump vec3 u_xlat16_3;
float u_xlat12;

void main()
{
    // ObjectToWorld transform of the position:
    // M[0]*x + M[1]*y + M[2]*z + M[3] (column 3 is added without scaling by w).
    u_xlat0 = in_POSITION0.yyyy * hlslcc_mtx4x4unity_ObjectToWorld[1];
    u_xlat0 = hlslcc_mtx4x4unity_ObjectToWorld[0] * in_POSITION0.xxxx + u_xlat0;
    u_xlat0 = hlslcc_mtx4x4unity_ObjectToWorld[2] * in_POSITION0.zzzz + u_xlat0;
    u_xlat0 = u_xlat0 + hlslcc_mtx4x4unity_ObjectToWorld[3];

    // MatrixVP transform of the result above, written to gl_Position.
    u_xlat1 = u_xlat0.yyyy * hlslcc_mtx4x4unity_MatrixVP[1];
    u_xlat1 = hlslcc_mtx4x4unity_MatrixVP[0] * u_xlat0.xxxx + u_xlat1;
    u_xlat1 = hlslcc_mtx4x4unity_MatrixVP[2] * u_xlat0.zzzz + u_xlat1;
    gl_Position = hlslcc_mtx4x4unity_MatrixVP[3] * u_xlat0.wwww + u_xlat1;

    // Pick the UV set that feeds the detail map.
#ifdef UNITY_ADRENO_ES3
    u_xlatb0 = !!(_UVSec==0.0);
#else
    u_xlatb0 = _UVSec==0.0;
#endif
    u_xlat0.xy = (bool(u_xlatb0)) ? in_TEXCOORD0.xy : in_TEXCOORD1.xy;
    vs_TEXCOORD0.zw = u_xlat0.xy * _DetailAlbedoMap_ST.xy + _DetailAlbedoMap_ST.zw;
    vs_TEXCOORD0.xy = in_TEXCOORD0.xy * _MainTex_ST.xy + _MainTex_ST.zw;

    // The xyz of the ObjectToWorld transform is computed a second time because
    // u_xlat0.xy was just overwritten; here column 3 is scaled by in_POSITION0.w.
    u_xlat0.xyz = in_POSITION0.yyy * hlslcc_mtx4x4unity_ObjectToWorld[1].xyz;
    u_xlat0.xyz = hlslcc_mtx4x4unity_ObjectToWorld[0].xyz * in_POSITION0.xxx + u_xlat0.xyz;
    u_xlat0.xyz = hlslcc_mtx4x4unity_ObjectToWorld[2].xyz * in_POSITION0.zzz + u_xlat0.xyz;
    u_xlat0.xyz = hlslcc_mtx4x4unity_ObjectToWorld[3].xyz * in_POSITION0.www + u_xlat0.xyz;
    vs_TEXCOORD1.xyz = u_xlat0.xyz + (-_WorldSpaceCameraPos.xyz);
    vs_TEXCOORD8.xyz = u_xlat0.xyz;
    vs_TEXCOORD1.w = 0.0;
    vs_TEXCOORD2 = vec4(0.0, 0.0, 0.0, 0.0);
    vs_TEXCOORD3 = vec4(0.0, 0.0, 0.0, 0.0);

    // Normal: dot products against WorldToObject[0..2].xyz, then normalized
    // with inversesqrt of the squared length.
    u_xlat0.x = dot(in_NORMAL0.xyz, hlslcc_mtx4x4unity_WorldToObject[0].xyz);
    u_xlat0.y = dot(in_NORMAL0.xyz, hlslcc_mtx4x4unity_WorldToObject[1].xyz);
    u_xlat0.z = dot(in_NORMAL0.xyz, hlslcc_mtx4x4unity_WorldToObject[2].xyz);
    u_xlat12 = dot(u_xlat0.xyz, u_xlat0.xyz);
    u_xlat12 = inversesqrt(u_xlat12);
    u_xlat0.xyz = vec3(u_xlat12) * u_xlat0.xyz;
    vs_TEXCOORD4.xyz = u_xlat0.xyz;
    vs_TEXCOORD4.w = 0.0;

    // Quadratic terms of the normal: (x*x - y*y) scales unity_SHC, and
    // (y*x, z*y, z*z, x*z) is dotted with unity_SHBr/g/b.
    // NOTE(review): presumably the second-order spherical-harmonics ambient
    // term, judging by the uniform names -- confirm against the Unity source.
    u_xlat16_2 = u_xlat0.y * u_xlat0.y;
    u_xlat16_2 = u_xlat0.x * u_xlat0.x + (-u_xlat16_2);
    u_xlat16_0 = u_xlat0.yzzx * u_xlat0.xyzz;
    u_xlat16_3.x = dot(unity_SHBr, u_xlat16_0);
    u_xlat16_3.y = dot(unity_SHBg, u_xlat16_0);
    u_xlat16_3.z = dot(unity_SHBb, u_xlat16_0);
    vs_TEXCOORD5.xyz = unity_SHC.xyz * vec3(u_xlat16_2) + u_xlat16_3.xyz;
    vs_TEXCOORD5.w = 0.0;
    vs_TEXCOORD7 = vec4(0.0, 0.0, 0.0, 0.0);
    return;
}
//#endif
进入到 malioc.exe 对应的目录，然后在 cmd 中运行它即可分析瓶颈。
示例如下,
```
C:\Users\rtorresb\Desktop\Tmp>malioc shader.vert
Mali Offline Compiler v7.1.0 (Build 7a3538)
Copyright 2007-2020 Arm Limited, all rights reserved

Configuration
=============

Hardware: Mali-G76 r0p0
Driver: Bifrost r19p0-00rel0
Shader type: OpenGL ES Vertex (inferred)

Main shader
===========

Work registers: 32
Uniform registers: 82
Stack spilling: False

                                A      LS       V       T    Bound
Total instruction cycles:     2.9    16.0     0.0     0.0       LS
Shortest path cycles:         2.9    16.0     0.0     0.0       LS
Longest path cycles:          2.9    16.0     0.0     0.0       LS

A = Arithmetic, LS = Load/Store, V = Varying, T = Texture
```
这个 vertex shader 的算术（Arithmetic）时钟周期是 2.9，load/store 的时钟周期是 16.0，因此瓶颈是 LS。
fragment shader调试同理
Here are a few key lessons you can get from this post:
Everything counts towards performance: instructions, texture channels, variants. Everything.
You have a neat tool to measure the cost of your shaders
And more importantly, you can now compare shaders’ performance when in doubt
You are now a step closer to 60 FPS.
However, keep in mind:
These estimates greatly vary across architectures and even driver versions…
Yet, these metrics will be incredibly useful for your optimization journey
比较重要的一点是：该工具只支持基于 Mali 架构的处理器，而且只是粗略估计；线上的真实环境（比如缓存等因素）它无法模拟。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。