テラByteの時代にキロByte

shader又はdemosceneに関係する事

Compute Shaderを書いてみた

pythonを使って、compute shaderを書いてみた。
compute shaderでtextureを作り、fragment shaderでtextureとして貼り付けてます。とりあえず、変数をどう取り扱うかのテストを兼ねてます。

from OpenGL.GL import *
from OpenGL.WGL import *
from ctypes import *
from ctypes.wintypes import *
import sys

# Vertex shader source. The clip-space position is derived purely from
# gl_VertexID (bit 0 -> x, bit 1 -> y, each mapped from {0, 1} to {-1, +1}),
# so no vertex buffer is required. The main loop draws it as a 4-vertex
# triangle strip.
vsh = """
#version 430

void main()
{
    gl_Position = vec4(ivec2(gl_VertexID & 1, gl_VertexID >> 1 & 1)*2-1, 1, 1);
}
"""

# Fragment shader source. Divides gl_FragCoord.xy by the `resolution` uniform
# and outputs the `tex2d` sample taken at that coordinate.
fsh = """
#version 430

uniform vec2 resolution;
uniform sampler2D tex2d;

out vec4 fragColor;

void main()
{
    vec2 p = gl_FragCoord.xy/ resolution;
    fragColor = texture(tex2d, p);
}
"""

# Compute shader source, declared with a 16x16 local work-group size. Each
# invocation converts its global invocation ID into a coordinate `p` that is
# centred on the image and scaled by resolution.y, measures the distance `de`
# to the diamond outline |x| + |y| = 1, blends a yellow-orange line colour
# over a dark grey background with smoothstep, and stores the result into
# the write-only image `destTex` at the invocation's pixel.
# NOTE(review): the `time` uniform is declared but never read in main().
csh = """
#version 430
 
uniform vec2 resolution;
uniform float time;
writeonly uniform image2D destTex;
 
layout(local_size_x=16, local_size_y=16) in;
 
void main() {
    vec2 fragCoord = vec2(gl_GlobalInvocationID.xy);
    vec2 p = (fragCoord * 2.0 - resolution) / resolution.y;
    vec3 col= vec3(0.1);
    float de = abs(abs(p.x) + abs(p.y) - 1.0);
    col = vec3(mix(vec3(1,0.8,0.1),col,smoothstep(0.0,0.005,de)));
    imageStore( destTex, ivec2(gl_GlobalInvocationID.xy),  vec4(col,0) );
}
"""

# Handles to the Windows system DLLs that the script calls through ctypes.
winmm, kernel32, user32 = windll.winmm, windll.kernel32, windll.user32

# Requested window size in pixels; the same values are handed to the shaders
# as `resolution` and used as the texture dimensions.
XRES, YRES = 640, 480

# Win32 window-style bits passed to CreateWindowExA.
WS_OVERLAPPEDWINDOW = 0x00CF0000
WS_VISIBLE = 0x10000000

# Win32 message-pump and input constants.
PM_REMOVE = 0x0001
WM_NCLBUTTONDOWN = 0x00A1
HTCLOSE = 20
VK_ESCAPE = 0x1B

# Pixel-format descriptor flag bits.
PFD_DOUBLEBUFFER = 0x00000001
PFD_SUPPORT_OPENGL = 0x00000020

# Create a visible overlapped window at (30, 30) with size XRES x YRES.
# The class-name argument is the integer 0xC018 rather than a string —
# presumably the atom of a predefined system window class, which would avoid
# registering a class of our own; TODO confirm.
hWnd = user32.CreateWindowExA(0,0xC018,0,WS_OVERLAPPEDWINDOW|WS_VISIBLE,30,30,XRES,YRES,0,0,0,0)
# Device context of the window; used for the pixel format, the GL context
# and SwapBuffers.
hdc = user32.GetDC(hWnd)   
user32.SetForegroundWindow(hWnd)
# Pixel format request with the OpenGL-support and double-buffer flags.
# NOTE(review): the remaining positional values depend on the field order of
# PIXELFORMATDESCRIPTOR, which is not visible here — verify against the struct.
pfd = PIXELFORMATDESCRIPTOR(0,1,PFD_SUPPORT_OPENGL|PFD_DOUBLEBUFFER,32,0,0,0,0,0,0,0,0,0,0,0,0,0,32,0,0,0,0,0,0,0)
SetPixelFormat(hdc, ChoosePixelFormat(hdc, pfd), pfd)
# Create the WGL rendering context and make it current; every gl* call below
# relies on this context being current.
hGLrc = wglCreateContext(hdc)
wglMakeCurrent(hdc, hGLrc)

# Initial fixed-function state: opaque black clear colour, back-face culling,
# and depth testing with the LEQUAL comparison.
glClearColor(0, 0, 0, 1)
glEnable(GL_CULL_FACE)
glCullFace(GL_BACK)
glEnable(GL_DEPTH_TEST)
glDepthFunc(GL_LEQUAL)

def _compile_shader(source, shader_type):
    """Compile *source* as a shader of *shader_type* and return its handle.

    Raises:
        RuntimeError: carrying the driver's info log if compilation fails.
    """
    shader = glCreateShader(shader_type)
    glShaderSource(shader, source)
    glCompileShader(shader)
    if glGetShaderiv(shader, GL_COMPILE_STATUS) != GL_TRUE:
        raise RuntimeError(glGetShaderInfoLog(shader).decode())
    return shader


def _build_program(*stages):
    """Compile and link ``(source, shader_type)`` pairs into one program.

    Returns:
        The handle of the linked program object.

    Raises:
        RuntimeError: if any stage fails to compile or the link step fails.
    """
    prog = glCreateProgram()
    for source, shader_type in stages:
        glAttachShader(prog, _compile_shader(source, shader_type))
    glLinkProgram(prog)
    # A program can fail to link even when every stage compiled (e.g.
    # mismatched interfaces), so check the link status explicitly instead of
    # continuing with an unusable program.
    if glGetProgramiv(prog, GL_LINK_STATUS) != GL_TRUE:
        log = glGetProgramInfoLog(prog)
        raise RuntimeError(log.decode() if isinstance(log, bytes) else log)
    return prog


# Render program: full-screen quad that displays the texture.
program = _build_program((vsh, GL_VERTEX_SHADER), (fsh, GL_FRAGMENT_SHADER))
glUseProgram(program)
glUniform2f(glGetUniformLocation(program, "resolution"), XRES, YRES)

# Compute program: fills the texture. It is left as the current program,
# exactly as before.
computeProg = _build_program((csh, GL_COMPUTE_SHADER))
glUseProgram(computeProg)
glUniform2f(glGetUniformLocation(computeProg, "resolution"), XRES, YRES)
    
# Allocate an XRES x YRES RGBA32F texture with no initial data, bound to
# texture unit 0 and sampled with linear filtering in both directions.
texture = glGenTextures(1)
glActiveTexture(GL_TEXTURE0)
glBindTexture(GL_TEXTURE_2D, texture)
for filter_name in (GL_TEXTURE_MIN_FILTER, GL_TEXTURE_MAG_FILTER):
    glTexParameteri(GL_TEXTURE_2D, filter_name, GL_LINEAR)
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA32F, XRES, YRES, 0, GL_RGBA, GL_FLOAT, None)
# Expose mip level 0 of the same texture on image unit 0 for write-only
# access from the compute shader.
glBindImageTexture(0, texture, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA32F)

# Point the compute shader's image uniform and the fragment shader's sampler
# uniform at unit 0. The render program is the one left current afterwards.
for prog, uniform_name in ((computeProg, "destTex"), (program, "tex2d")):
    glUseProgram(prog)
    glUniform1i(glGetUniformLocation(prog, uniform_name), 0)
    
# Main loop: pump window messages, run the compute pass that fills the
# texture, draw the texture on a full-screen quad, and print FPS / elapsed
# time. Ends on the window's close button, on Escape, or after `duration`
# seconds.
duration = 60
msg = MSG()
lpmsg = pointer(msg)
zero = winmm.timeGetTime()
done = False
fps, cnt, s0 = 0, 0, 0

# The uniform location never changes after linking, so resolve it once
# instead of once per frame.
timeLoc = glGetUniformLocation(computeProg, "time")

# Work-group counts for the dispatch. LOCAL_SIZE must match local_size_x/y in
# the compute shader. Rounding up keeps the right/top edge covered when the
# resolution is not a multiple of the local size; image stores outside the
# texture have no effect.
LOCAL_SIZE = 16
groupsX = (XRES + LOCAL_SIZE - 1) // LOCAL_SIZE
groupsY = (YRES + LOCAL_SIZE - 1) // LOCAL_SIZE

while not done:
    while user32.PeekMessageA(lpmsg, 0, 0, 0, PM_REMOVE):
        # A click on the non-client close button ends the demo.
        if msg.message == WM_NCLBUTTONDOWN and msg.wParam == HTCLOSE:
            done = True
        user32.DispatchMessageA(lpmsg)
    if user32.GetAsyncKeyState(VK_ESCAPE):
        done = True
    # Elapsed time in seconds (timeGetTime reports milliseconds).
    t = (winmm.timeGetTime() - zero) * 0.001

    glUseProgram(computeProg)
    glUniform1f(timeLoc, t)
    glDispatchCompute(groupsX, groupsY, 1)
    # imageStore writes are not automatically visible to later texture
    # fetches; without this barrier the draw below may sample stale data.
    glMemoryBarrier(GL_TEXTURE_FETCH_BARRIER_BIT)

    glUseProgram(program)
    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT)
    glDrawArrays(GL_TRIANGLE_STRIP, 0, 4)

    SwapBuffers(hdc)

    # Frame counter: publish the number of frames drawn roughly once a second.
    cnt += 1
    if t - s0 > 1:
        fps = cnt
        cnt = 0
        s0 = t
    sys.stdout.write("\r FPS : %d TIME : %f" % (fps, t))
    sys.stdout.flush()

    if t > duration:
        done = True
    
# Teardown, in reverse order of setup: unbind the GL context from this thread
# before deleting it, release the window's device context, then post a quit
# message and destroy the window.
wglMakeCurrent(0, 0)
wglDeleteContext(hGLrc)
user32.ReleaseDC(hWnd, hdc)
user32.PostQuitMessage(0)
user32.DestroyWindow(hWnd)

compute shaderに登場してきたglDispatchCompute(XRES//16, YRES//16, 1)とshaderの中のlayout(local_size_x=16, local_size_y=16) in;について調べてみます。
この2つの関係を手探りで調べていきます。

glDispatchCompute(XRES//1, YRES//1, 1) -- layout(local_size_x=1, local_size_y=1) in; -- FPS 60
glDispatchCompute(XRES//16, YRES//16, 1) -- layout(local_size_x=16, local_size_y=16) in; -- FPS 60
glDispatchCompute(XRES//32, YRES//32, 1) -- layout(local_size_x=32, local_size_y=32) in; -- FPS 60

glDispatchCompute(XRES//32, YRES//32, 1) -- layout(local_size_x=16, local_size_y=16) in; -- 画面表示が4分の1

もうちょい、GPUの負荷が増えるソースを書いて、又試してみます。
画面で座標を取るのは、

vec2 fragCoord = vec2(gl_GlobalInvocationID.xy);
vec2 p = (fragCoord * 2.0 - resolution) / resolution.y;

で良さそう。
gl_GlobalInvocationIDはuvec3みたい。

参考 OpenGL の ComputeShader

3D textureを試した奴