[GH-ISSUE #7748] ggml.c:4044: GGML_ASSERT(view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src)) failed #4947

Open
opened 2026-04-12 16:00:23 -05:00 by GiteaMirror · 9 comments
Owner

Originally created by @pavelruzicka on GitHub (Nov 19, 2024).
Original GitHub issue: https://github.com/ollama/ollama/issues/7748

What is the issue?

On certain API requests, the server crashes with a segmentation fault and the API responds with an HTTP 500. So far, I have encountered this twice in thousands of requests. Unfortunately, I did not log the particular prompts that triggered this, but I do not expect the issue to be directly reproducible from a specific prompt.

Full stack trace:

ggml.c:4044: GGML_ASSERT(view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src)) failed
SIGSEGV: segmentation violation
PC=0x7ae06884d1d7 m=4 sigcode=1 addr=0x204803fbc
signal arrived during cgo execution

goroutine 7 gp=0xc000156000 m=4 mp=0xc00004d808 [syscall]:
runtime.cgocall(0x5bb738602e90, 0xc000056b60)
        runtime/cgocall.go:157 +0x4b fp=0xc000056b38 sp=0xc000056b00 pc=0x5bb7383853cb
github.com/ollama/ollama/llama._Cfunc_llama_decode(0x7adfec006460, {0x1, 0x7adfec3acb70, 0x0, 0x0, 0x7adfec3ad380, 0x7adfec3adb90, 0x7adfec17b380, 0x7adfd10c2910,
0x0, ...})
        _cgo_gotypes.go:543 +0x52 fp=0xc000056b60 sp=0xc000056b38 pc=0x5bb738482952
github.com/ollama/ollama/llama.(*Context).Decode.func1(0x5bb7385fed4b?, 0x7adfec006460?)
        github.com/ollama/ollama/llama/llama.go:167 +0xd8 fp=0xc000056c80 sp=0xc000056b60 pc=0x5bb738484e78
github.com/ollama/ollama/llama.(*Context).Decode(0xc000056d68?, 0x1?)
        github.com/ollama/ollama/llama/llama.go:167 +0x17 fp=0xc000056cc8 sp=0xc000056c80 pc=0x5bb738484cd7
main.(*Server).processBatch(0xc000128120, 0xc000126150, 0xc0001261c0)
        github.com/ollama/ollama/llama/runner/runner.go:424 +0x29e fp=0xc000056ed0 sp=0xc000056cc8 pc=0x5bb7385fdd7e
main.(*Server).run(0xc000128120, {0x5bb73893ca40, 0xc00007c050})
        github.com/ollama/ollama/llama/runner/runner.go:338 +0x1a5 fp=0xc000056fb8 sp=0xc000056ed0 pc=0x5bb7385fd765
main.main.gowrap2()
        github.com/ollama/ollama/llama/runner/runner.go:901 +0x28 fp=0xc000056fe0 sp=0xc000056fb8 pc=0x5bb738601ec8
runtime.goexit({})
        runtime/asm_amd64.s:1695 +0x1 fp=0xc000056fe8 sp=0xc000056fe0 pc=0x5bb7383edde1
created by main.main in goroutine 1
        github.com/ollama/ollama/llama/runner/runner.go:901 +0xc2b

goroutine 1 gp=0xc0000061c0 m=nil [IO wait, 520 minutes]:
runtime.gopark(0xc000038a08?, 0xc00014b908?, 0xb1?, 0x7a?, 0x2000?)
        runtime/proc.go:402 +0xce fp=0xc00014b888 sp=0xc00014b868 pc=0x5bb7383bc00e
runtime.netpollblock(0xc00014b920?, 0x38384b26?, 0xb7?)
        runtime/netpoll.go:573 +0xf7 fp=0xc00014b8c0 sp=0xc00014b888 pc=0x5bb7383b4257
internal/poll.runtime_pollWait(0x7ae067dc7fe0, 0x72)
        runtime/netpoll.go:345 +0x85 fp=0xc00014b8e0 sp=0xc00014b8c0 pc=0x5bb7383e8aa5
internal/poll.(*pollDesc).wait(0x3?, 0x7c?, 0x0)
        internal/poll/fd_poll_runtime.go:84 +0x27 fp=0xc00014b908 sp=0xc00014b8e0 pc=0x5bb7384389c7
internal/poll.(*pollDesc).waitRead(...)
        internal/poll/fd_poll_runtime.go:89
internal/poll.(*FD).Accept(0xc000150080)
        internal/poll/fd_unix.go:611 +0x2ac fp=0xc00014b9b0 sp=0xc00014b908 pc=0x5bb738439e8c
net.(*netFD).accept(0xc000150080)
        net/fd_unix.go:172 +0x29 fp=0xc00014ba68 sp=0xc00014b9b0 pc=0x5bb7384a88a9
net.(*TCPListener).accept(0xc0000321e0)
        net/tcpsock_posix.go:159 +0x1e fp=0xc00014ba90 sp=0xc00014ba68 pc=0x5bb7384b95de
net.(*TCPListener).Accept(0xc0000321e0)
        net/tcpsock.go:327 +0x30 fp=0xc00014bac0 sp=0xc00014ba90 pc=0x5bb7384b8930
net/http.(*onceCloseListener).Accept(0xc000190090?)
        <autogenerated>:1 +0x24 fp=0xc00014bad8 sp=0xc00014bac0 pc=0x5bb7385dfa44
net/http.(*Server).Serve(0xc000168000, {0x5bb73893c400, 0xc0000321e0})
        net/http/server.go:3260 +0x33e fp=0xc00014bc08 sp=0xc00014bad8 pc=0x5bb7385d685e
main.main()
        github.com/ollama/ollama/llama/runner/runner.go:921 +0xfcc fp=0xc00014bf50 sp=0xc00014bc08 pc=0x5bb738601c4c
runtime.main()
        runtime/proc.go:271 +0x29d fp=0xc00014bfe0 sp=0xc00014bf50 pc=0x5bb7383bbbdd
runtime.goexit({})
        runtime/asm_amd64.s:1695 +0x1 fp=0xc00014bfe8 sp=0xc00014bfe0 pc=0x5bb7383edde1

goroutine 2 gp=0xc000006c40 m=nil [force gc (idle), 3 minutes]:
runtime.gopark(0x1dd19be52e23?, 0x0?, 0x0?, 0x0?, 0x0?)
        runtime/proc.go:402 +0xce fp=0xc000046fa8 sp=0xc000046f88 pc=0x5bb7383bc00e
runtime.goparkunlock(...)
        runtime/proc.go:408
runtime.forcegchelper()
        runtime/proc.go:326 +0xb8 fp=0xc000046fe0 sp=0xc000046fa8 pc=0x5bb7383bbe98
runtime.goexit({})
        runtime/asm_amd64.s:1695 +0x1 fp=0xc000046fe8 sp=0xc000046fe0 pc=0x5bb7383edde1
created by runtime.init.6 in goroutine 1
        runtime/proc.go:314 +0x1a

goroutine 3 gp=0xc000007180 m=nil [GC sweep wait]:
runtime.gopark(0x5bb738b09e01?, 0x5bb738b09e40?, 0xc?, 0x9?, 0x1?)
        runtime/proc.go:402 +0xce fp=0xc000047780 sp=0xc000047760 pc=0x5bb7383bc00e
runtime.goparkunlock(...)
        runtime/proc.go:408
runtime.bgsweep(0xc00006e000)
        runtime/mgcsweep.go:318 +0xdf fp=0xc0000477c8 sp=0xc000047780 pc=0x5bb7383a6b9f
runtime.gcenable.gowrap1()
        runtime/mgc.go:203 +0x25 fp=0xc0000477e0 sp=0xc0000477c8 pc=0x5bb73839b685
runtime.goexit({})
        runtime/asm_amd64.s:1695 +0x1 fp=0xc0000477e8 sp=0xc0000477e0 pc=0x5bb7383edde1
created by runtime.gcenable in goroutine 1
        runtime/mgc.go:203 +0x66

goroutine 4 gp=0xc000007340 m=nil [GC scavenge wait]:
runtime.gopark(0x10000?, 0x166b9ea?, 0x0?, 0x0?, 0x0?)
        runtime/proc.go:402 +0xce fp=0xc000047f78 sp=0xc000047f58 pc=0x5bb7383bc00e
runtime.goparkunlock(...)
        runtime/proc.go:408
runtime.(*scavengerState).park(0x5bb738b0a4c0)
        runtime/mgcscavenge.go:425 +0x49 fp=0xc000047fa8 sp=0xc000047f78 pc=0x5bb7383a4549
runtime.bgscavenge(0xc00006e000)
        runtime/mgcscavenge.go:658 +0x59 fp=0xc000047fc8 sp=0xc000047fa8 pc=0x5bb7383a4af9
runtime.gcenable.gowrap2()
        runtime/mgc.go:204 +0x25 fp=0xc000047fe0 sp=0xc000047fc8 pc=0x5bb73839b625
runtime.goexit({})
        runtime/asm_amd64.s:1695 +0x1 fp=0xc000047fe8 sp=0xc000047fe0 pc=0x5bb7383edde1
created by runtime.gcenable in goroutine 1
        runtime/mgc.go:204 +0xa5

goroutine 5 gp=0xc000007c00 m=nil [finalizer wait, 3 minutes]:
runtime.gopark(0x0?, 0x5bb7389381a0?, 0x0?, 0x60?, 0x1000000010?)
        runtime/proc.go:402 +0xce fp=0xc000046620 sp=0xc000046600 pc=0x5bb7383bc00e
runtime.runfinq()
        runtime/mfinal.go:194 +0x107 fp=0xc0000467e0 sp=0xc000046620 pc=0x5bb73839a6c7
runtime.goexit({})
        runtime/asm_amd64.s:1695 +0x1 fp=0xc0000467e8 sp=0xc0000467e0 pc=0x5bb7383edde1
created by runtime.createfing in goroutine 1
        runtime/mfinal.go:164 +0x3d

goroutine 22 gp=0xc000196000 m=nil [select]:
runtime.gopark(0xc000147a80?, 0x2?, 0x18?, 0x77?, 0xc000147824?)
        runtime/proc.go:402 +0xce fp=0xc000147698 sp=0xc000147678 pc=0x5bb7383bc00e
runtime.selectgo(0xc000147a80, 0xc000147820, 0xc00037de00?, 0x0, 0x2?, 0x1)
        runtime/select.go:327 +0x725 fp=0xc0001477b8 sp=0xc000147698 pc=0x5bb7383cd3e5
main.(*Server).completion(0xc000128120, {0x5bb73893c5b0, 0xc0000e22a0}, 0xc0000c0360)
        github.com/ollama/ollama/llama/runner/runner.go:652 +0x8fe fp=0xc000147ab8 sp=0xc0001477b8 pc=0x5bb7385ff6de
main.(*Server).completion-fm({0x5bb73893c5b0?, 0xc0000e22a0?}, 0x5bb7385dab8d?)
        <autogenerated>:1 +0x36 fp=0xc000147ae8 sp=0xc000147ab8 pc=0x5bb7386026b6
net/http.HandlerFunc.ServeHTTP(0xc00010cb60?, {0x5bb73893c5b0?, 0xc0000e22a0?}, 0x10?)
        net/http/server.go:2171 +0x29 fp=0xc000147b10 sp=0xc000147ae8 pc=0x5bb7385d3629
net/http.(*ServeMux).ServeHTTP(0x5bb73838ef85?, {0x5bb73893c5b0, 0xc0000e22a0}, 0xc0000c0360)
        net/http/server.go:2688 +0x1ad fp=0xc000147b60 sp=0xc000147b10 pc=0x5bb7385d54ad
net/http.serverHandler.ServeHTTP({0x5bb73893b900?}, {0x5bb73893c5b0?, 0xc0000e22a0?}, 0x6?)
        net/http/server.go:3142 +0x8e fp=0xc000147b90 sp=0xc000147b60 pc=0x5bb7385d64ce
net/http.(*conn).serve(0xc000190090, {0x5bb73893ca08, 0xc00010adb0})
        net/http/server.go:2044 +0x5e8 fp=0xc000147fb8 sp=0xc000147b90 pc=0x5bb7385d2268
net/http.(*Server).Serve.gowrap3()
        net/http/server.go:3290 +0x28 fp=0xc000147fe0 sp=0xc000147fb8 pc=0x5bb7385d6c48
runtime.goexit({})
        runtime/asm_amd64.s:1695 +0x1 fp=0xc000147fe8 sp=0xc000147fe0 pc=0x5bb7383edde1
created by net/http.(*Server).Serve in goroutine 1
        net/http/server.go:3290 +0x4b4

goroutine 21 gp=0xc000082a80 m=nil [GC worker (idle), 4 minutes]:
runtime.gopark(0x1db5aaee6275?, 0x3?, 0x58?, 0xf?, 0x0?)
        runtime/proc.go:402 +0xce fp=0xc0000cdf50 sp=0xc0000cdf30 pc=0x5bb7383bc00e
runtime.gcBgMarkWorker()
        runtime/mgc.go:1310 +0xe5 fp=0xc0000cdfe0 sp=0xc0000cdf50 pc=0x5bb73839d585
runtime.goexit({})
        runtime/asm_amd64.s:1695 +0x1 fp=0xc0000cdfe8 sp=0xc0000cdfe0 pc=0x5bb7383edde1
created by runtime.gcBgMarkStartWorkers in goroutine 18
        runtime/mgc.go:1234 +0x1c

goroutine 41 gp=0xc000082fc0 m=nil [GC worker (idle), 3 minutes]:
runtime.gopark(0x1db5aaee606c?, 0x3?, 0xc?, 0xfe?, 0x0?)
        runtime/proc.go:402 +0xce fp=0xc0000cf750 sp=0xc0000cf730 pc=0x5bb7383bc00e
runtime.gcBgMarkWorker()
        runtime/mgc.go:1310 +0xe5 fp=0xc0000cf7e0 sp=0xc0000cf750 pc=0x5bb73839d585
runtime.goexit({})
        runtime/asm_amd64.s:1695 +0x1 fp=0xc0000cf7e8 sp=0xc0000cf7e0 pc=0x5bb7383edde1
created by runtime.gcBgMarkStartWorkers in goroutine 18
        runtime/mgc.go:1234 +0x1c

goroutine 50 gp=0xc0005e2000 m=nil [GC worker (idle), 3 minutes]:
runtime.gopark(0x1dd19bf2c88b?, 0x3?, 0xbf?, 0x40?, 0x0?)
        runtime/proc.go:402 +0xce fp=0xc0000c8750 sp=0xc0000c8730 pc=0x5bb7383bc00e
runtime.gcBgMarkWorker()
        runtime/mgc.go:1310 +0xe5 fp=0xc0000c87e0 sp=0xc0000c8750 pc=0x5bb73839d585
runtime.goexit({})
        runtime/asm_amd64.s:1695 +0x1 fp=0xc0000c87e8 sp=0xc0000c87e0 pc=0x5bb7383edde1
created by runtime.gcBgMarkStartWorkers in goroutine 18
        runtime/mgc.go:1234 +0x1c

goroutine 42 gp=0xc000083180 m=nil [GC worker (idle), 66 minutes]:
runtime.gopark(0x1a5228359582?, 0x3?, 0x1c?, 0x4?, 0x0?)
        runtime/proc.go:402 +0xce fp=0xc0000cff50 sp=0xc0000cff30 pc=0x5bb7383bc00e
runtime.gcBgMarkWorker()
        runtime/mgc.go:1310 +0xe5 fp=0xc0000cffe0 sp=0xc0000cff50 pc=0x5bb73839d585
runtime.goexit({})
        runtime/asm_amd64.s:1695 +0x1 fp=0xc0000cffe8 sp=0xc0000cffe0 pc=0x5bb7383edde1
created by runtime.gcBgMarkStartWorkers in goroutine 18
        runtime/mgc.go:1234 +0x1c

goroutine 11112 gp=0xc000156a80 m=nil [IO wait, 4 minutes]:
runtime.gopark(0x10?, 0x10?, 0xf0?, 0xbd?, 0xb?)
        runtime/proc.go:402 +0xce fp=0xc00019bda8 sp=0xc00019bd88 pc=0x5bb7383bc00e
runtime.netpollblock(0x5bb738422558?, 0x38384b26?, 0xb7?)
        runtime/netpoll.go:573 +0xf7 fp=0xc00019bde0 sp=0xc00019bda8 pc=0x5bb7383b4257
internal/poll.runtime_pollWait(0x7ae067dc7ee8, 0x72)
        runtime/netpoll.go:345 +0x85 fp=0xc00019be00 sp=0xc00019bde0 pc=0x5bb7383e8aa5
internal/poll.(*pollDesc).wait(0xc000164a00?, 0xc00010ab81?, 0x0)
        internal/poll/fd_poll_runtime.go:84 +0x27 fp=0xc00019be28 sp=0xc00019be00 pc=0x5bb7384389c7
internal/poll.(*pollDesc).waitRead(...)
        internal/poll/fd_poll_runtime.go:89
internal/poll.(*FD).Read(0xc000164a00, {0xc00010ab81, 0x1, 0x1})
        internal/poll/fd_unix.go:164 +0x27a fp=0xc00019bec0 sp=0xc00019be28 pc=0x5bb73843951a
net.(*netFD).Read(0xc000164a00, {0xc00010ab81?, 0xc00019bf48?, 0x5bb7383ea6d0?})
        net/fd_posix.go:55 +0x25 fp=0xc00019bf08 sp=0xc00019bec0 pc=0x5bb7384a77a5
net.(*conn).Read(0xc00004a000, {0xc00010ab81?, 0x385041544f792f41?, 0xc00010ab78?})
        net/net.go:185 +0x45 fp=0xc00019bf50 sp=0xc00019bf08 pc=0x5bb7384b1a65
net.(*TCPConn).Read(0xc00010ab70?, {0xc00010ab81?, 0x3450472f58332f59?, 0x636f422b44786847?})
        <autogenerated>:1 +0x25 fp=0xc00019bf80 sp=0xc00019bf50 pc=0x5bb7384bd445
net/http.(*connReader).backgroundRead(0xc00010ab70)
        net/http/server.go:681 +0x37 fp=0xc00019bfc8 sp=0xc00019bf80 pc=0x5bb7385cc1d7
net/http.(*connReader).startBackgroundRead.gowrap2()
        net/http/server.go:677 +0x25 fp=0xc00019bfe0 sp=0xc00019bfc8 pc=0x5bb7385cc105
runtime.goexit({})
        runtime/asm_amd64.s:1695 +0x1 fp=0xc00019bfe8 sp=0xc00019bfe0 pc=0x5bb7383edde1
created by net/http.(*connReader).startBackgroundRead in goroutine 22
        net/http/server.go:677 +0xba

rax    0x204803fbc
rbx    0x7adfd17adce0
rcx    0xfef
rdx    0x7adfd10edb90
rdi    0x7adfd10edba0
rsi    0x0
rbp    0x7adffa7ddeb0
rsp    0x7adffa7dde90
r8     0x1
r9     0x7adfd16203b8
r10    0x0
r11    0x246
r12    0x7ade6000ccc0
r13    0x7adfd10edba0
r14    0x0
r15    0x7ae0b4ef57d0
rip    0x7ae06884d1d7
rflags 0x10297
cs     0x33
fs     0x0
gs     0x0
SIGABRT: abort
PC=0x7ae04269eb1c m=4 sigcode=18446744073709551610
signal arrived during cgo execution

goroutine 7 gp=0xc000156000 m=4 mp=0xc00004d808 [syscall]:
runtime.cgocall(0x5bb738602e90, 0xc000056b60)
        runtime/cgocall.go:157 +0x4b fp=0xc000056b38 sp=0xc000056b00 pc=0x5bb7383853cb
github.com/ollama/ollama/llama._Cfunc_llama_decode(0x7adfec006460, {0x1, 0x7adfec3acb70, 0x0, 0x0, 0x7adfec3ad380, 0x7adfec3adb90, 0x7adfec17b380, 0x7adfd10c2910,
0x0, ...})
        _cgo_gotypes.go:543 +0x52 fp=0xc000056b60 sp=0xc000056b38 pc=0x5bb738482952
github.com/ollama/ollama/llama.(*Context).Decode.func1(0x5bb7385fed4b?, 0x7adfec006460?)
        github.com/ollama/ollama/llama/llama.go:167 +0xd8 fp=0xc000056c80 sp=0xc000056b60 pc=0x5bb738484e78
github.com/ollama/ollama/llama.(*Context).Decode(0xc000056d68?, 0x1?)
        github.com/ollama/ollama/llama/llama.go:167 +0x17 fp=0xc000056cc8 sp=0xc000056c80 pc=0x5bb738484cd7
main.(*Server).processBatch(0xc000128120, 0xc000126150, 0xc0001261c0)
        github.com/ollama/ollama/llama/runner/runner.go:424 +0x29e fp=0xc000056ed0 sp=0xc000056cc8 pc=0x5bb7385fdd7e
main.(*Server).run(0xc000128120, {0x5bb73893ca40, 0xc00007c050})
        github.com/ollama/ollama/llama/runner/runner.go:338 +0x1a5 fp=0xc000056fb8 sp=0xc000056ed0 pc=0x5bb7385fd765
main.main.gowrap2()
        github.com/ollama/ollama/llama/runner/runner.go:901 +0x28 fp=0xc000056fe0 sp=0xc000056fb8 pc=0x5bb738601ec8
runtime.goexit({})
        runtime/asm_amd64.s:1695 +0x1 fp=0xc000056fe8 sp=0xc000056fe0 pc=0x5bb7383edde1
created by main.main in goroutine 1
        github.com/ollama/ollama/llama/runner/runner.go:901 +0xc2b

goroutine 1 gp=0xc0000061c0 m=nil [IO wait, 520 minutes]:
runtime.gopark(0xc000038a08?, 0xc00014b908?, 0xb1?, 0x7a?, 0x2000?)
        runtime/proc.go:402 +0xce fp=0xc00014b888 sp=0xc00014b868 pc=0x5bb7383bc00e
runtime.netpollblock(0xc00014b920?, 0x38384b26?, 0xb7?)
        runtime/netpoll.go:573 +0xf7 fp=0xc00014b8c0 sp=0xc00014b888 pc=0x5bb7383b4257
internal/poll.runtime_pollWait(0x7ae067dc7fe0, 0x72)
        runtime/netpoll.go:345 +0x85 fp=0xc00014b8e0 sp=0xc00014b8c0 pc=0x5bb7383e8aa5
internal/poll.(*pollDesc).wait(0x3?, 0x7c?, 0x0)
        internal/poll/fd_poll_runtime.go:84 +0x27 fp=0xc00014b908 sp=0xc00014b8e0 pc=0x5bb7384389c7
internal/poll.(*pollDesc).waitRead(...)
        internal/poll/fd_poll_runtime.go:89
internal/poll.(*FD).Accept(0xc000150080)
        internal/poll/fd_unix.go:611 +0x2ac fp=0xc00014b9b0 sp=0xc00014b908 pc=0x5bb738439e8c
net.(*netFD).accept(0xc000150080)
        net/fd_unix.go:172 +0x29 fp=0xc00014ba68 sp=0xc00014b9b0 pc=0x5bb7384a88a9
net.(*TCPListener).accept(0xc0000321e0)
        net/tcpsock_posix.go:159 +0x1e fp=0xc00014ba90 sp=0xc00014ba68 pc=0x5bb7384b95de
net.(*TCPListener).Accept(0xc0000321e0)
        net/tcpsock.go:327 +0x30 fp=0xc00014bac0 sp=0xc00014ba90 pc=0x5bb7384b8930
net/http.(*onceCloseListener).Accept(0xc000190090?)
        <autogenerated>:1 +0x24 fp=0xc00014bad8 sp=0xc00014bac0 pc=0x5bb7385dfa44
net/http.(*Server).Serve(0xc000168000, {0x5bb73893c400, 0xc0000321e0})
        net/http/server.go:3260 +0x33e fp=0xc00014bc08 sp=0xc00014bad8 pc=0x5bb7385d685e
main.main()
        github.com/ollama/ollama/llama/runner/runner.go:921 +0xfcc fp=0xc00014bf50 sp=0xc00014bc08 pc=0x5bb738601c4c
runtime.main()
        runtime/proc.go:271 +0x29d fp=0xc00014bfe0 sp=0xc00014bf50 pc=0x5bb7383bbbdd
runtime.goexit({})
        runtime/asm_amd64.s:1695 +0x1 fp=0xc00014bfe8 sp=0xc00014bfe0 pc=0x5bb7383edde1

goroutine 2 gp=0xc000006c40 m=nil [force gc (idle), 3 minutes]:
runtime.gopark(0x1dd19be52e23?, 0x0?, 0x0?, 0x0?, 0x0?)
        runtime/proc.go:402 +0xce fp=0xc000046fa8 sp=0xc000046f88 pc=0x5bb7383bc00e
runtime.goparkunlock(...)
        runtime/proc.go:408
runtime.forcegchelper()
        runtime/proc.go:326 +0xb8 fp=0xc000046fe0 sp=0xc000046fa8 pc=0x5bb7383bbe98
runtime.goexit({})
        runtime/asm_amd64.s:1695 +0x1 fp=0xc000046fe8 sp=0xc000046fe0 pc=0x5bb7383edde1
created by runtime.init.6 in goroutine 1
        runtime/proc.go:314 +0x1a

goroutine 3 gp=0xc000007180 m=nil [GC sweep wait]:
runtime.gopark(0x5bb738b09e01?, 0x5bb738b09e40?, 0xc?, 0x9?, 0x1?)
        runtime/proc.go:402 +0xce fp=0xc000047780 sp=0xc000047760 pc=0x5bb7383bc00e
runtime.goparkunlock(...)
        runtime/proc.go:408
runtime.bgsweep(0xc00006e000)
        runtime/mgcsweep.go:318 +0xdf fp=0xc0000477c8 sp=0xc000047780 pc=0x5bb7383a6b9f
runtime.gcenable.gowrap1()
        runtime/mgc.go:203 +0x25 fp=0xc0000477e0 sp=0xc0000477c8 pc=0x5bb73839b685
runtime.goexit({})
        runtime/asm_amd64.s:1695 +0x1 fp=0xc0000477e8 sp=0xc0000477e0 pc=0x5bb7383edde1
created by runtime.gcenable in goroutine 1
        runtime/mgc.go:203 +0x66

goroutine 4 gp=0xc000007340 m=nil [GC scavenge wait]:
runtime.gopark(0x10000?, 0x166b9ea?, 0x0?, 0x0?, 0x0?)
        runtime/proc.go:402 +0xce fp=0xc000047f78 sp=0xc000047f58 pc=0x5bb7383bc00e
runtime.goparkunlock(...)
        runtime/proc.go:408
runtime.(*scavengerState).park(0x5bb738b0a4c0)
        runtime/mgcscavenge.go:425 +0x49 fp=0xc000047fa8 sp=0xc000047f78 pc=0x5bb7383a4549
runtime.bgscavenge(0xc00006e000)
        runtime/mgcscavenge.go:658 +0x59 fp=0xc000047fc8 sp=0xc000047fa8 pc=0x5bb7383a4af9
runtime.gcenable.gowrap2()
        runtime/mgc.go:204 +0x25 fp=0xc000047fe0 sp=0xc000047fc8 pc=0x5bb73839b625
runtime.goexit({})
        runtime/asm_amd64.s:1695 +0x1 fp=0xc000047fe8 sp=0xc000047fe0 pc=0x5bb7383edde1
created by runtime.gcenable in goroutine 1
        runtime/mgc.go:204 +0xa5

goroutine 5 gp=0xc000007c00 m=nil [finalizer wait, 3 minutes]:
runtime.gopark(0x0?, 0x5bb7389381a0?, 0x0?, 0x60?, 0x1000000010?)
        runtime/proc.go:402 +0xce fp=0xc000046620 sp=0xc000046600 pc=0x5bb7383bc00e
runtime.runfinq()
        runtime/mfinal.go:194 +0x107 fp=0xc0000467e0 sp=0xc000046620 pc=0x5bb73839a6c7
runtime.goexit({})
        runtime/asm_amd64.s:1695 +0x1 fp=0xc0000467e8 sp=0xc0000467e0 pc=0x5bb7383edde1
created by runtime.createfing in goroutine 1
        runtime/mfinal.go:164 +0x3d

goroutine 22 gp=0xc000196000 m=nil [select]:
runtime.gopark(0xc000147a80?, 0x2?, 0x18?, 0x77?, 0xc000147824?)
        runtime/proc.go:402 +0xce fp=0xc000147698 sp=0xc000147678 pc=0x5bb7383bc00e
runtime.selectgo(0xc000147a80, 0xc000147820, 0xc00037de00?, 0x0, 0x2?, 0x1)
        runtime/select.go:327 +0x725 fp=0xc0001477b8 sp=0xc000147698 pc=0x5bb7383cd3e5
main.(*Server).completion(0xc000128120, {0x5bb73893c5b0, 0xc0000e22a0}, 0xc0000c0360)
        github.com/ollama/ollama/llama/runner/runner.go:652 +0x8fe fp=0xc000147ab8 sp=0xc0001477b8 pc=0x5bb7385ff6de
main.(*Server).completion-fm({0x5bb73893c5b0?, 0xc0000e22a0?}, 0x5bb7385dab8d?)
        <autogenerated>:1 +0x36 fp=0xc000147ae8 sp=0xc000147ab8 pc=0x5bb7386026b6
net/http.HandlerFunc.ServeHTTP(0xc00010cb60?, {0x5bb73893c5b0?, 0xc0000e22a0?}, 0x10?)
        net/http/server.go:2171 +0x29 fp=0xc000147b10 sp=0xc000147ae8 pc=0x5bb7385d3629
net/http.(*ServeMux).ServeHTTP(0x5bb73838ef85?, {0x5bb73893c5b0, 0xc0000e22a0}, 0xc0000c0360)
        net/http/server.go:2688 +0x1ad fp=0xc000147b60 sp=0xc000147b10 pc=0x5bb7385d54ad
net/http.serverHandler.ServeHTTP({0x5bb73893b900?}, {0x5bb73893c5b0?, 0xc0000e22a0?}, 0x6?)
        net/http/server.go:3142 +0x8e fp=0xc000147b90 sp=0xc000147b60 pc=0x5bb7385d64ce
net/http.(*conn).serve(0xc000190090, {0x5bb73893ca08, 0xc00010adb0})
        net/http/server.go:2044 +0x5e8 fp=0xc000147fb8 sp=0xc000147b90 pc=0x5bb7385d2268
net/http.(*Server).Serve.gowrap3()
        net/http/server.go:3290 +0x28 fp=0xc000147fe0 sp=0xc000147fb8 pc=0x5bb7385d6c48
runtime.goexit({})
        runtime/asm_amd64.s:1695 +0x1 fp=0xc000147fe8 sp=0xc000147fe0 pc=0x5bb7383edde1
created by net/http.(*Server).Serve in goroutine 1
        net/http/server.go:3290 +0x4b4

goroutine 21 gp=0xc000082a80 m=nil [GC worker (idle), 4 minutes]:
runtime.gopark(0x1db5aaee6275?, 0x3?, 0x58?, 0xf?, 0x0?)
        runtime/proc.go:402 +0xce fp=0xc0000cdf50 sp=0xc0000cdf30 pc=0x5bb7383bc00e
runtime.gcBgMarkWorker()
        runtime/mgc.go:1310 +0xe5 fp=0xc0000cdfe0 sp=0xc0000cdf50 pc=0x5bb73839d585
runtime.goexit({})
        runtime/asm_amd64.s:1695 +0x1 fp=0xc0000cdfe8 sp=0xc0000cdfe0 pc=0x5bb7383edde1
created by runtime.gcBgMarkStartWorkers in goroutine 18
        runtime/mgc.go:1234 +0x1c

goroutine 41 gp=0xc000082fc0 m=nil [GC worker (idle), 3 minutes]:
runtime.gopark(0x1db5aaee606c?, 0x3?, 0xc?, 0xfe?, 0x0?)
        runtime/proc.go:402 +0xce fp=0xc0000cf750 sp=0xc0000cf730 pc=0x5bb7383bc00e
runtime.gcBgMarkWorker()
        runtime/mgc.go:1310 +0xe5 fp=0xc0000cf7e0 sp=0xc0000cf750 pc=0x5bb73839d585
runtime.goexit({})
        runtime/asm_amd64.s:1695 +0x1 fp=0xc0000cf7e8 sp=0xc0000cf7e0 pc=0x5bb7383edde1
created by runtime.gcBgMarkStartWorkers in goroutine 18
        runtime/mgc.go:1234 +0x1c

goroutine 50 gp=0xc0005e2000 m=nil [GC worker (idle), 3 minutes]:
runtime.gopark(0x1dd19bf2c88b?, 0x3?, 0xbf?, 0x40?, 0x0?)
        runtime/proc.go:402 +0xce fp=0xc0000c8750 sp=0xc0000c8730 pc=0x5bb7383bc00e
runtime.gcBgMarkWorker()
        runtime/mgc.go:1310 +0xe5 fp=0xc0000c87e0 sp=0xc0000c8750 pc=0x5bb73839d585
runtime.goexit({})
        runtime/asm_amd64.s:1695 +0x1 fp=0xc0000c87e8 sp=0xc0000c87e0 pc=0x5bb7383edde1
created by runtime.gcBgMarkStartWorkers in goroutine 18
        runtime/mgc.go:1234 +0x1c

goroutine 42 gp=0xc000083180 m=nil [GC worker (idle), 66 minutes]:
runtime.gopark(0x1a5228359582?, 0x3?, 0x1c?, 0x4?, 0x0?)
        runtime/proc.go:402 +0xce fp=0xc0000cff50 sp=0xc0000cff30 pc=0x5bb7383bc00e
runtime.gcBgMarkWorker()
        runtime/mgc.go:1310 +0xe5 fp=0xc0000cffe0 sp=0xc0000cff50 pc=0x5bb73839d585
runtime.goexit({})
        runtime/asm_amd64.s:1695 +0x1 fp=0xc0000cffe8 sp=0xc0000cffe0 pc=0x5bb7383edde1
created by runtime.gcBgMarkStartWorkers in goroutine 18
        runtime/mgc.go:1234 +0x1c

goroutine 11112 gp=0xc000156a80 m=nil [IO wait, 4 minutes]:
runtime.gopark(0x10?, 0x10?, 0xf0?, 0xbd?, 0xb?)
        runtime/proc.go:402 +0xce fp=0xc00019bda8 sp=0xc00019bd88 pc=0x5bb7383bc00e
runtime.netpollblock(0x5bb738422558?, 0x38384b26?, 0xb7?)
        runtime/netpoll.go:573 +0xf7 fp=0xc00019bde0 sp=0xc00019bda8 pc=0x5bb7383b4257
internal/poll.runtime_pollWait(0x7ae067dc7ee8, 0x72)
        runtime/netpoll.go:345 +0x85 fp=0xc00019be00 sp=0xc00019bde0 pc=0x5bb7383e8aa5
internal/poll.(*pollDesc).wait(0xc000164a00?, 0xc00010ab81?, 0x0)
        internal/poll/fd_poll_runtime.go:84 +0x27 fp=0xc00019be28 sp=0xc00019be00 pc=0x5bb7384389c7
internal/poll.(*pollDesc).waitRead(...)
        internal/poll/fd_poll_runtime.go:89
internal/poll.(*FD).Read(0xc000164a00, {0xc00010ab81, 0x1, 0x1})
        internal/poll/fd_unix.go:164 +0x27a fp=0xc00019bec0 sp=0xc00019be28 pc=0x5bb73843951a
net.(*netFD).Read(0xc000164a00, {0xc00010ab81?, 0xc00019bf48?, 0x5bb7383ea6d0?})
        net/fd_posix.go:55 +0x25 fp=0xc00019bf08 sp=0xc00019bec0 pc=0x5bb7384a77a5
net.(*conn).Read(0xc00004a000, {0xc00010ab81?, 0x385041544f792f41?, 0xc00010ab78?})
        net/net.go:185 +0x45 fp=0xc00019bf50 sp=0xc00019bf08 pc=0x5bb7384b1a65
net.(*TCPConn).Read(0xc00010ab70?, {0xc00010ab81?, 0x3450472f58332f59?, 0x636f422b44786847?})
        <autogenerated>:1 +0x25 fp=0xc00019bf80 sp=0xc00019bf50 pc=0x5bb7384bd445
net/http.(*connReader).backgroundRead(0xc00010ab70)
        net/http/server.go:681 +0x37 fp=0xc00019bfc8 sp=0xc00019bf80 pc=0x5bb7385cc1d7
net/http.(*connReader).startBackgroundRead.gowrap2()
        net/http/server.go:677 +0x25 fp=0xc00019bfe0 sp=0xc00019bfc8 pc=0x5bb7385cc105
runtime.goexit({})
        runtime/asm_amd64.s:1695 +0x1 fp=0xc00019bfe8 sp=0xc00019bfe0 pc=0x5bb7383edde1
created by net/http.(*connReader).startBackgroundRead in goroutine 22
        net/http/server.go:677 +0xba

rax    0x0
rbx    0xa6ba
rcx    0x7ae04269eb1c
rdx    0x6
rdi    0xa6b7
rsi    0xa6ba
rbp    0x7adffa7de010
rsp    0x7adffa7ddfd0
r8     0x0
r9     0x0
r10    0x8
r11    0x246
r12    0x6
r13    0xfcc
r14    0x16
r15    0x0
rip    0x7ae04269eb1c
rflags 0x246
cs     0x33
fs     0x0
gs     0x0
[GIN] 2024/11/19 - 07:09:34 | 500 |         3m37s |       127.0.0.1 | POST     "/api/generate"

OS

Linux

GPU

Nvidia

CPU

AMD

Ollama version

0.4.1

Originally created by @pavelruzicka on GitHub (Nov 19, 2024). Original GitHub issue: https://github.com/ollama/ollama/issues/7748 ### What is the issue? On certain API requests, the server throws a segmentation fault error and the API responds with a HTTP 500. So far, I have encountered this twice in thousands of requests. Unfortunately I do not have the particular prompts that resulted in this logged but I do not expect this to be directly reproducible based on a prompt. Full stack trace: ``` ggml.c:4044: GGML_ASSERT(view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src)) failed SIGSEGV: segmentation violation PC=0x7ae06884d1d7 m=4 sigcode=1 addr=0x204803fbc signal arrived during cgo execution goroutine 7 gp=0xc000156000 m=4 mp=0xc00004d808 [syscall]: runtime.cgocall(0x5bb738602e90, 0xc000056b60) runtime/cgocall.go:157 +0x4b fp=0xc000056b38 sp=0xc000056b00 pc=0x5bb7383853cb github.com/ollama/ollama/llama._Cfunc_llama_decode(0x7adfec006460, {0x1, 0x7adfec3acb70, 0x0, 0x0, 0x7adfec3ad380, 0x7adfec3adb90, 0x7adfec17b380, 0x7adfd10c2910, 0x0, ...}) _cgo_gotypes.go:543 +0x52 fp=0xc000056b60 sp=0xc000056b38 pc=0x5bb738482952 github.com/ollama/ollama/llama.(*Context).Decode.func1(0x5bb7385fed4b?, 0x7adfec006460?) github.com/ollama/ollama/llama/llama.go:167 +0xd8 fp=0xc000056c80 sp=0xc000056b60 pc=0x5bb738484e78 github.com/ollama/ollama/llama.(*Context).Decode(0xc000056d68?, 0x1?) 
github.com/ollama/ollama/llama/llama.go:167 +0x17 fp=0xc000056cc8 sp=0xc000056c80 pc=0x5bb738484cd7 main.(*Server).processBatch(0xc000128120, 0xc000126150, 0xc0001261c0) github.com/ollama/ollama/llama/runner/runner.go:424 +0x29e fp=0xc000056ed0 sp=0xc000056cc8 pc=0x5bb7385fdd7e main.(*Server).run(0xc000128120, {0x5bb73893ca40, 0xc00007c050}) github.com/ollama/ollama/llama/runner/runner.go:338 +0x1a5 fp=0xc000056fb8 sp=0xc000056ed0 pc=0x5bb7385fd765 main.main.gowrap2() github.com/ollama/ollama/llama/runner/runner.go:901 +0x28 fp=0xc000056fe0 sp=0xc000056fb8 pc=0x5bb738601ec8 runtime.goexit({}) runtime/asm_amd64.s:1695 +0x1 fp=0xc000056fe8 sp=0xc000056fe0 pc=0x5bb7383edde1 created by main.main in goroutine 1 github.com/ollama/ollama/llama/runner/runner.go:901 +0xc2b goroutine 1 gp=0xc0000061c0 m=nil [IO wait, 520 minutes]: runtime.gopark(0xc000038a08?, 0xc00014b908?, 0xb1?, 0x7a?, 0x2000?) runtime/proc.go:402 +0xce fp=0xc00014b888 sp=0xc00014b868 pc=0x5bb7383bc00e runtime.netpollblock(0xc00014b920?, 0x38384b26?, 0xb7?) runtime/netpoll.go:573 +0xf7 fp=0xc00014b8c0 sp=0xc00014b888 pc=0x5bb7383b4257 internal/poll.runtime_pollWait(0x7ae067dc7fe0, 0x72) runtime/netpoll.go:345 +0x85 fp=0xc00014b8e0 sp=0xc00014b8c0 pc=0x5bb7383e8aa5 internal/poll.(*pollDesc).wait(0x3?, 0x7c?, 0x0) internal/poll/fd_poll_runtime.go:84 +0x27 fp=0xc00014b908 sp=0xc00014b8e0 pc=0x5bb7384389c7 internal/poll.(*pollDesc).waitRead(...) internal/poll/fd_poll_runtime.go:89 internal/poll.(*FD).Accept(0xc000150080) internal/poll/fd_unix.go:611 +0x2ac fp=0xc00014b9b0 sp=0xc00014b908 pc=0x5bb738439e8c net.(*netFD).accept(0xc000150080) net/fd_unix.go:172 +0x29 fp=0xc00014ba68 sp=0xc00014b9b0 pc=0x5bb7384a88a9 net.(*TCPListener).accept(0xc0000321e0) net/tcpsock_posix.go:159 +0x1e fp=0xc00014ba90 sp=0xc00014ba68 pc=0x5bb7384b95de net.(*TCPListener).Accept(0xc0000321e0) net/tcpsock.go:327 +0x30 fp=0xc00014bac0 sp=0xc00014ba90 pc=0x5bb7384b8930 net/http.(*onceCloseListener).Accept(0xc000190090?) 
<autogenerated>:1 +0x24 fp=0xc00014bad8 sp=0xc00014bac0 pc=0x5bb7385dfa44 net/http.(*Server).Serve(0xc000168000, {0x5bb73893c400, 0xc0000321e0}) net/http/server.go:3260 +0x33e fp=0xc00014bc08 sp=0xc00014bad8 pc=0x5bb7385d685e main.main() github.com/ollama/ollama/llama/runner/runner.go:921 +0xfcc fp=0xc00014bf50 sp=0xc00014bc08 pc=0x5bb738601c4c runtime.main() runtime/proc.go:271 +0x29d fp=0xc00014bfe0 sp=0xc00014bf50 pc=0x5bb7383bbbdd runtime.goexit({}) runtime/asm_amd64.s:1695 +0x1 fp=0xc00014bfe8 sp=0xc00014bfe0 pc=0x5bb7383edde1 goroutine 2 gp=0xc000006c40 m=nil [force gc (idle), 3 minutes]: runtime.gopark(0x1dd19be52e23?, 0x0?, 0x0?, 0x0?, 0x0?) runtime/proc.go:402 +0xce fp=0xc000046fa8 sp=0xc000046f88 pc=0x5bb7383bc00e runtime.goparkunlock(...) runtime/proc.go:408 runtime.forcegchelper() runtime/proc.go:326 +0xb8 fp=0xc000046fe0 sp=0xc000046fa8 pc=0x5bb7383bbe98 runtime.goexit({}) runtime/asm_amd64.s:1695 +0x1 fp=0xc000046fe8 sp=0xc000046fe0 pc=0x5bb7383edde1 created by runtime.init.6 in goroutine 1 runtime/proc.go:314 +0x1a goroutine 3 gp=0xc000007180 m=nil [GC sweep wait]: runtime.gopark(0x5bb738b09e01?, 0x5bb738b09e40?, 0xc?, 0x9?, 0x1?) runtime/proc.go:402 +0xce fp=0xc000047780 sp=0xc000047760 pc=0x5bb7383bc00e runtime.goparkunlock(...) runtime/proc.go:408 runtime.bgsweep(0xc00006e000) runtime/mgcsweep.go:318 +0xdf fp=0xc0000477c8 sp=0xc000047780 pc=0x5bb7383a6b9f runtime.gcenable.gowrap1() runtime/mgc.go:203 +0x25 fp=0xc0000477e0 sp=0xc0000477c8 pc=0x5bb73839b685 runtime.goexit({}) runtime/asm_amd64.s:1695 +0x1 fp=0xc0000477e8 sp=0xc0000477e0 pc=0x5bb7383edde1 created by runtime.gcenable in goroutine 1 runtime/mgc.go:203 +0x66 goroutine 4 gp=0xc000007340 m=nil [GC scavenge wait]: runtime.gopark(0x10000?, 0x166b9ea?, 0x0?, 0x0?, 0x0?) runtime/proc.go:402 +0xce fp=0xc000047f78 sp=0xc000047f58 pc=0x5bb7383bc00e runtime.goparkunlock(...) 
runtime/proc.go:408 runtime.(*scavengerState).park(0x5bb738b0a4c0) runtime/mgcscavenge.go:425 +0x49 fp=0xc000047fa8 sp=0xc000047f78 pc=0x5bb7383a4549 runtime.bgscavenge(0xc00006e000) runtime/mgcscavenge.go:658 +0x59 fp=0xc000047fc8 sp=0xc000047fa8 pc=0x5bb7383a4af9 runtime.gcenable.gowrap2() runtime/mgc.go:204 +0x25 fp=0xc000047fe0 sp=0xc000047fc8 pc=0x5bb73839b625 runtime.goexit({}) runtime/asm_amd64.s:1695 +0x1 fp=0xc000047fe8 sp=0xc000047fe0 pc=0x5bb7383edde1 created by runtime.gcenable in goroutine 1 runtime/mgc.go:204 +0xa5 goroutine 5 gp=0xc000007c00 m=nil [finalizer wait, 3 minutes]: runtime.gopark(0x0?, 0x5bb7389381a0?, 0x0?, 0x60?, 0x1000000010?) runtime/proc.go:402 +0xce fp=0xc000046620 sp=0xc000046600 pc=0x5bb7383bc00e runtime.runfinq() runtime/mfinal.go:194 +0x107 fp=0xc0000467e0 sp=0xc000046620 pc=0x5bb73839a6c7 runtime.goexit({}) runtime/asm_amd64.s:1695 +0x1 fp=0xc0000467e8 sp=0xc0000467e0 pc=0x5bb7383edde1 created by runtime.createfing in goroutine 1 runtime/mfinal.go:164 +0x3d goroutine 22 gp=0xc000196000 m=nil [select]: runtime.gopark(0xc000147a80?, 0x2?, 0x18?, 0x77?, 0xc000147824?) runtime/proc.go:402 +0xce fp=0xc000147698 sp=0xc000147678 pc=0x5bb7383bc00e runtime.selectgo(0xc000147a80, 0xc000147820, 0xc00037de00?, 0x0, 0x2?, 0x1) runtime/select.go:327 +0x725 fp=0xc0001477b8 sp=0xc000147698 pc=0x5bb7383cd3e5 main.(*Server).completion(0xc000128120, {0x5bb73893c5b0, 0xc0000e22a0}, 0xc0000c0360) github.com/ollama/ollama/llama/runner/runner.go:652 +0x8fe fp=0xc000147ab8 sp=0xc0001477b8 pc=0x5bb7385ff6de main.(*Server).completion-fm({0x5bb73893c5b0?, 0xc0000e22a0?}, 0x5bb7385dab8d?) <autogenerated>:1 +0x36 fp=0xc000147ae8 sp=0xc000147ab8 pc=0x5bb7386026b6 net/http.HandlerFunc.ServeHTTP(0xc00010cb60?, {0x5bb73893c5b0?, 0xc0000e22a0?}, 0x10?) 
net/http/server.go:2171 +0x29 fp=0xc000147b10 sp=0xc000147ae8 pc=0x5bb7385d3629 net/http.(*ServeMux).ServeHTTP(0x5bb73838ef85?, {0x5bb73893c5b0, 0xc0000e22a0}, 0xc0000c0360) net/http/server.go:2688 +0x1ad fp=0xc000147b60 sp=0xc000147b10 pc=0x5bb7385d54ad net/http.serverHandler.ServeHTTP({0x5bb73893b900?}, {0x5bb73893c5b0?, 0xc0000e22a0?}, 0x6?) net/http/server.go:3142 +0x8e fp=0xc000147b90 sp=0xc000147b60 pc=0x5bb7385d64ce net/http.(*conn).serve(0xc000190090, {0x5bb73893ca08, 0xc00010adb0}) net/http/server.go:2044 +0x5e8 fp=0xc000147fb8 sp=0xc000147b90 pc=0x5bb7385d2268 net/http.(*Server).Serve.gowrap3() net/http/server.go:3290 +0x28 fp=0xc000147fe0 sp=0xc000147fb8 pc=0x5bb7385d6c48 runtime.goexit({}) runtime/asm_amd64.s:1695 +0x1 fp=0xc000147fe8 sp=0xc000147fe0 pc=0x5bb7383edde1 created by net/http.(*Server).Serve in goroutine 1 net/http/server.go:3290 +0x4b4 goroutine 21 gp=0xc000082a80 m=nil [GC worker (idle), 4 minutes]: runtime.gopark(0x1db5aaee6275?, 0x3?, 0x58?, 0xf?, 0x0?) runtime/proc.go:402 +0xce fp=0xc0000cdf50 sp=0xc0000cdf30 pc=0x5bb7383bc00e runtime.gcBgMarkWorker() runtime/mgc.go:1310 +0xe5 fp=0xc0000cdfe0 sp=0xc0000cdf50 pc=0x5bb73839d585 runtime.goexit({}) runtime/asm_amd64.s:1695 +0x1 fp=0xc0000cdfe8 sp=0xc0000cdfe0 pc=0x5bb7383edde1 created by runtime.gcBgMarkStartWorkers in goroutine 18 runtime/mgc.go:1234 +0x1c goroutine 41 gp=0xc000082fc0 m=nil [GC worker (idle), 3 minutes]: runtime.gopark(0x1db5aaee606c?, 0x3?, 0xc?, 0xfe?, 0x0?) runtime/proc.go:402 +0xce fp=0xc0000cf750 sp=0xc0000cf730 pc=0x5bb7383bc00e runtime.gcBgMarkWorker() runtime/mgc.go:1310 +0xe5 fp=0xc0000cf7e0 sp=0xc0000cf750 pc=0x5bb73839d585 runtime.goexit({}) runtime/asm_amd64.s:1695 +0x1 fp=0xc0000cf7e8 sp=0xc0000cf7e0 pc=0x5bb7383edde1 created by runtime.gcBgMarkStartWorkers in goroutine 18 runtime/mgc.go:1234 +0x1c goroutine 50 gp=0xc0005e2000 m=nil [GC worker (idle), 3 minutes]: runtime.gopark(0x1dd19bf2c88b?, 0x3?, 0xbf?, 0x40?, 0x0?) 
runtime/proc.go:402 +0xce fp=0xc0000c8750 sp=0xc0000c8730 pc=0x5bb7383bc00e runtime.gcBgMarkWorker() runtime/mgc.go:1310 +0xe5 fp=0xc0000c87e0 sp=0xc0000c8750 pc=0x5bb73839d585 runtime.goexit({}) runtime/asm_amd64.s:1695 +0x1 fp=0xc0000c87e8 sp=0xc0000c87e0 pc=0x5bb7383edde1 created by runtime.gcBgMarkStartWorkers in goroutine 18 runtime/mgc.go:1234 +0x1c goroutine 42 gp=0xc000083180 m=nil [GC worker (idle), 66 minutes]: runtime.gopark(0x1a5228359582?, 0x3?, 0x1c?, 0x4?, 0x0?) runtime/proc.go:402 +0xce fp=0xc0000cff50 sp=0xc0000cff30 pc=0x5bb7383bc00e runtime.gcBgMarkWorker() runtime/mgc.go:1310 +0xe5 fp=0xc0000cffe0 sp=0xc0000cff50 pc=0x5bb73839d585 runtime.goexit({}) runtime/asm_amd64.s:1695 +0x1 fp=0xc0000cffe8 sp=0xc0000cffe0 pc=0x5bb7383edde1 created by runtime.gcBgMarkStartWorkers in goroutine 18 runtime/mgc.go:1234 +0x1c goroutine 11112 gp=0xc000156a80 m=nil [IO wait, 4 minutes]: runtime.gopark(0x10?, 0x10?, 0xf0?, 0xbd?, 0xb?) runtime/proc.go:402 +0xce fp=0xc00019bda8 sp=0xc00019bd88 pc=0x5bb7383bc00e runtime.netpollblock(0x5bb738422558?, 0x38384b26?, 0xb7?) runtime/netpoll.go:573 +0xf7 fp=0xc00019bde0 sp=0xc00019bda8 pc=0x5bb7383b4257 internal/poll.runtime_pollWait(0x7ae067dc7ee8, 0x72) runtime/netpoll.go:345 +0x85 fp=0xc00019be00 sp=0xc00019bde0 pc=0x5bb7383e8aa5 internal/poll.(*pollDesc).wait(0xc000164a00?, 0xc00010ab81?, 0x0) internal/poll/fd_poll_runtime.go:84 +0x27 fp=0xc00019be28 sp=0xc00019be00 pc=0x5bb7384389c7 internal/poll.(*pollDesc).waitRead(...) 
internal/poll/fd_poll_runtime.go:89 internal/poll.(*FD).Read(0xc000164a00, {0xc00010ab81, 0x1, 0x1}) internal/poll/fd_unix.go:164 +0x27a fp=0xc00019bec0 sp=0xc00019be28 pc=0x5bb73843951a net.(*netFD).Read(0xc000164a00, {0xc00010ab81?, 0xc00019bf48?, 0x5bb7383ea6d0?}) net/fd_posix.go:55 +0x25 fp=0xc00019bf08 sp=0xc00019bec0 pc=0x5bb7384a77a5 net.(*conn).Read(0xc00004a000, {0xc00010ab81?, 0x385041544f792f41?, 0xc00010ab78?}) net/net.go:185 +0x45 fp=0xc00019bf50 sp=0xc00019bf08 pc=0x5bb7384b1a65 net.(*TCPConn).Read(0xc00010ab70?, {0xc00010ab81?, 0x3450472f58332f59?, 0x636f422b44786847?}) <autogenerated>:1 +0x25 fp=0xc00019bf80 sp=0xc00019bf50 pc=0x5bb7384bd445 net/http.(*connReader).backgroundRead(0xc00010ab70) net/http/server.go:681 +0x37 fp=0xc00019bfc8 sp=0xc00019bf80 pc=0x5bb7385cc1d7 net/http.(*connReader).startBackgroundRead.gowrap2() net/http/server.go:677 +0x25 fp=0xc00019bfe0 sp=0xc00019bfc8 pc=0x5bb7385cc105 runtime.goexit({}) runtime/asm_amd64.s:1695 +0x1 fp=0xc00019bfe8 sp=0xc00019bfe0 pc=0x5bb7383edde1 created by net/http.(*connReader).startBackgroundRead in goroutine 22 net/http/server.go:677 +0xba rax 0x204803fbc rbx 0x7adfd17adce0 rcx 0xfef rdx 0x7adfd10edb90 rdi 0x7adfd10edba0 rsi 0x0 rbp 0x7adffa7ddeb0 rsp 0x7adffa7dde90 r8 0x1 r9 0x7adfd16203b8 r10 0x0 r11 0x246 r12 0x7ade6000ccc0 r13 0x7adfd10edba0 r14 0x0 r15 0x7ae0b4ef57d0 rip 0x7ae06884d1d7 rflags 0x10297 cs 0x33 fs 0x0 gs 0x0 SIGABRT: abort PC=0x7ae04269eb1c m=4 sigcode=18446744073709551610 signal arrived during cgo execution goroutine 7 gp=0xc000156000 m=4 mp=0xc00004d808 [syscall]: runtime.cgocall(0x5bb738602e90, 0xc000056b60) runtime/cgocall.go:157 +0x4b fp=0xc000056b38 sp=0xc000056b00 pc=0x5bb7383853cb github.com/ollama/ollama/llama._Cfunc_llama_decode(0x7adfec006460, {0x1, 0x7adfec3acb70, 0x0, 0x0, 0x7adfec3ad380, 0x7adfec3adb90, 0x7adfec17b380, 0x7adfd10c2910, 0x0, ...}) _cgo_gotypes.go:543 +0x52 fp=0xc000056b60 sp=0xc000056b38 pc=0x5bb738482952 
github.com/ollama/ollama/llama.(*Context).Decode.func1(0x5bb7385fed4b?, 0x7adfec006460?) github.com/ollama/ollama/llama/llama.go:167 +0xd8 fp=0xc000056c80 sp=0xc000056b60 pc=0x5bb738484e78 github.com/ollama/ollama/llama.(*Context).Decode(0xc000056d68?, 0x1?) github.com/ollama/ollama/llama/llama.go:167 +0x17 fp=0xc000056cc8 sp=0xc000056c80 pc=0x5bb738484cd7 main.(*Server).processBatch(0xc000128120, 0xc000126150, 0xc0001261c0) github.com/ollama/ollama/llama/runner/runner.go:424 +0x29e fp=0xc000056ed0 sp=0xc000056cc8 pc=0x5bb7385fdd7e main.(*Server).run(0xc000128120, {0x5bb73893ca40, 0xc00007c050}) github.com/ollama/ollama/llama/runner/runner.go:338 +0x1a5 fp=0xc000056fb8 sp=0xc000056ed0 pc=0x5bb7385fd765 main.main.gowrap2() github.com/ollama/ollama/llama/runner/runner.go:901 +0x28 fp=0xc000056fe0 sp=0xc000056fb8 pc=0x5bb738601ec8 runtime.goexit({}) runtime/asm_amd64.s:1695 +0x1 fp=0xc000056fe8 sp=0xc000056fe0 pc=0x5bb7383edde1 created by main.main in goroutine 1 github.com/ollama/ollama/llama/runner/runner.go:901 +0xc2b goroutine 1 gp=0xc0000061c0 m=nil [IO wait, 520 minutes]: runtime.gopark(0xc000038a08?, 0xc00014b908?, 0xb1?, 0x7a?, 0x2000?) runtime/proc.go:402 +0xce fp=0xc00014b888 sp=0xc00014b868 pc=0x5bb7383bc00e runtime.netpollblock(0xc00014b920?, 0x38384b26?, 0xb7?) runtime/netpoll.go:573 +0xf7 fp=0xc00014b8c0 sp=0xc00014b888 pc=0x5bb7383b4257 internal/poll.runtime_pollWait(0x7ae067dc7fe0, 0x72) runtime/netpoll.go:345 +0x85 fp=0xc00014b8e0 sp=0xc00014b8c0 pc=0x5bb7383e8aa5 internal/poll.(*pollDesc).wait(0x3?, 0x7c?, 0x0) internal/poll/fd_poll_runtime.go:84 +0x27 fp=0xc00014b908 sp=0xc00014b8e0 pc=0x5bb7384389c7 internal/poll.(*pollDesc).waitRead(...) 
internal/poll/fd_poll_runtime.go:89 internal/poll.(*FD).Accept(0xc000150080) internal/poll/fd_unix.go:611 +0x2ac fp=0xc00014b9b0 sp=0xc00014b908 pc=0x5bb738439e8c net.(*netFD).accept(0xc000150080) net/fd_unix.go:172 +0x29 fp=0xc00014ba68 sp=0xc00014b9b0 pc=0x5bb7384a88a9 net.(*TCPListener).accept(0xc0000321e0) net/tcpsock_posix.go:159 +0x1e fp=0xc00014ba90 sp=0xc00014ba68 pc=0x5bb7384b95de net.(*TCPListener).Accept(0xc0000321e0) net/tcpsock.go:327 +0x30 fp=0xc00014bac0 sp=0xc00014ba90 pc=0x5bb7384b8930 net/http.(*onceCloseListener).Accept(0xc000190090?) <autogenerated>:1 +0x24 fp=0xc00014bad8 sp=0xc00014bac0 pc=0x5bb7385dfa44 net/http.(*Server).Serve(0xc000168000, {0x5bb73893c400, 0xc0000321e0}) net/http/server.go:3260 +0x33e fp=0xc00014bc08 sp=0xc00014bad8 pc=0x5bb7385d685e main.main() github.com/ollama/ollama/llama/runner/runner.go:921 +0xfcc fp=0xc00014bf50 sp=0xc00014bc08 pc=0x5bb738601c4c runtime.main() runtime/proc.go:271 +0x29d fp=0xc00014bfe0 sp=0xc00014bf50 pc=0x5bb7383bbbdd runtime.goexit({}) runtime/asm_amd64.s:1695 +0x1 fp=0xc00014bfe8 sp=0xc00014bfe0 pc=0x5bb7383edde1 goroutine 2 gp=0xc000006c40 m=nil [force gc (idle), 3 minutes]: runtime.gopark(0x1dd19be52e23?, 0x0?, 0x0?, 0x0?, 0x0?) runtime/proc.go:402 +0xce fp=0xc000046fa8 sp=0xc000046f88 pc=0x5bb7383bc00e runtime.goparkunlock(...) runtime/proc.go:408 runtime.forcegchelper() runtime/proc.go:326 +0xb8 fp=0xc000046fe0 sp=0xc000046fa8 pc=0x5bb7383bbe98 runtime.goexit({}) runtime/asm_amd64.s:1695 +0x1 fp=0xc000046fe8 sp=0xc000046fe0 pc=0x5bb7383edde1 created by runtime.init.6 in goroutine 1 runtime/proc.go:314 +0x1a goroutine 3 gp=0xc000007180 m=nil [GC sweep wait]: runtime.gopark(0x5bb738b09e01?, 0x5bb738b09e40?, 0xc?, 0x9?, 0x1?) runtime/proc.go:402 +0xce fp=0xc000047780 sp=0xc000047760 pc=0x5bb7383bc00e runtime.goparkunlock(...) 
runtime/proc.go:408 runtime.bgsweep(0xc00006e000) runtime/mgcsweep.go:318 +0xdf fp=0xc0000477c8 sp=0xc000047780 pc=0x5bb7383a6b9f runtime.gcenable.gowrap1() runtime/mgc.go:203 +0x25 fp=0xc0000477e0 sp=0xc0000477c8 pc=0x5bb73839b685 runtime.goexit({}) runtime/asm_amd64.s:1695 +0x1 fp=0xc0000477e8 sp=0xc0000477e0 pc=0x5bb7383edde1 created by runtime.gcenable in goroutine 1 runtime/mgc.go:203 +0x66 goroutine 4 gp=0xc000007340 m=nil [GC scavenge wait]: runtime.gopark(0x10000?, 0x166b9ea?, 0x0?, 0x0?, 0x0?) runtime/proc.go:402 +0xce fp=0xc000047f78 sp=0xc000047f58 pc=0x5bb7383bc00e runtime.goparkunlock(...) runtime/proc.go:408 runtime.(*scavengerState).park(0x5bb738b0a4c0) runtime/mgcscavenge.go:425 +0x49 fp=0xc000047fa8 sp=0xc000047f78 pc=0x5bb7383a4549 runtime.bgscavenge(0xc00006e000) runtime/mgcscavenge.go:658 +0x59 fp=0xc000047fc8 sp=0xc000047fa8 pc=0x5bb7383a4af9 runtime.gcenable.gowrap2() runtime/mgc.go:204 +0x25 fp=0xc000047fe0 sp=0xc000047fc8 pc=0x5bb73839b625 runtime.goexit({}) runtime/asm_amd64.s:1695 +0x1 fp=0xc000047fe8 sp=0xc000047fe0 pc=0x5bb7383edde1 created by runtime.gcenable in goroutine 1 runtime/mgc.go:204 +0xa5 goroutine 5 gp=0xc000007c00 m=nil [finalizer wait, 3 minutes]: runtime.gopark(0x0?, 0x5bb7389381a0?, 0x0?, 0x60?, 0x1000000010?) runtime/proc.go:402 +0xce fp=0xc000046620 sp=0xc000046600 pc=0x5bb7383bc00e runtime.runfinq() runtime/mfinal.go:194 +0x107 fp=0xc0000467e0 sp=0xc000046620 pc=0x5bb73839a6c7 runtime.goexit({}) runtime/asm_amd64.s:1695 +0x1 fp=0xc0000467e8 sp=0xc0000467e0 pc=0x5bb7383edde1 created by runtime.createfing in goroutine 1 runtime/mfinal.go:164 +0x3d goroutine 22 gp=0xc000196000 m=nil [select]: runtime.gopark(0xc000147a80?, 0x2?, 0x18?, 0x77?, 0xc000147824?) 
runtime/proc.go:402 +0xce fp=0xc000147698 sp=0xc000147678 pc=0x5bb7383bc00e runtime.selectgo(0xc000147a80, 0xc000147820, 0xc00037de00?, 0x0, 0x2?, 0x1) runtime/select.go:327 +0x725 fp=0xc0001477b8 sp=0xc000147698 pc=0x5bb7383cd3e5 main.(*Server).completion(0xc000128120, {0x5bb73893c5b0, 0xc0000e22a0}, 0xc0000c0360) github.com/ollama/ollama/llama/runner/runner.go:652 +0x8fe fp=0xc000147ab8 sp=0xc0001477b8 pc=0x5bb7385ff6de main.(*Server).completion-fm({0x5bb73893c5b0?, 0xc0000e22a0?}, 0x5bb7385dab8d?) <autogenerated>:1 +0x36 fp=0xc000147ae8 sp=0xc000147ab8 pc=0x5bb7386026b6 net/http.HandlerFunc.ServeHTTP(0xc00010cb60?, {0x5bb73893c5b0?, 0xc0000e22a0?}, 0x10?) net/http/server.go:2171 +0x29 fp=0xc000147b10 sp=0xc000147ae8 pc=0x5bb7385d3629 net/http.(*ServeMux).ServeHTTP(0x5bb73838ef85?, {0x5bb73893c5b0, 0xc0000e22a0}, 0xc0000c0360) net/http/server.go:2688 +0x1ad fp=0xc000147b60 sp=0xc000147b10 pc=0x5bb7385d54ad net/http.serverHandler.ServeHTTP({0x5bb73893b900?}, {0x5bb73893c5b0?, 0xc0000e22a0?}, 0x6?) net/http/server.go:3142 +0x8e fp=0xc000147b90 sp=0xc000147b60 pc=0x5bb7385d64ce net/http.(*conn).serve(0xc000190090, {0x5bb73893ca08, 0xc00010adb0}) net/http/server.go:2044 +0x5e8 fp=0xc000147fb8 sp=0xc000147b90 pc=0x5bb7385d2268 net/http.(*Server).Serve.gowrap3() net/http/server.go:3290 +0x28 fp=0xc000147fe0 sp=0xc000147fb8 pc=0x5bb7385d6c48 runtime.goexit({}) runtime/asm_amd64.s:1695 +0x1 fp=0xc000147fe8 sp=0xc000147fe0 pc=0x5bb7383edde1 created by net/http.(*Server).Serve in goroutine 1 net/http/server.go:3290 +0x4b4 goroutine 21 gp=0xc000082a80 m=nil [GC worker (idle), 4 minutes]: runtime.gopark(0x1db5aaee6275?, 0x3?, 0x58?, 0xf?, 0x0?) 
runtime/proc.go:402 +0xce fp=0xc0000cdf50 sp=0xc0000cdf30 pc=0x5bb7383bc00e runtime.gcBgMarkWorker() runtime/mgc.go:1310 +0xe5 fp=0xc0000cdfe0 sp=0xc0000cdf50 pc=0x5bb73839d585 runtime.goexit({}) runtime/asm_amd64.s:1695 +0x1 fp=0xc0000cdfe8 sp=0xc0000cdfe0 pc=0x5bb7383edde1 created by runtime.gcBgMarkStartWorkers in goroutine 18 runtime/mgc.go:1234 +0x1c goroutine 41 gp=0xc000082fc0 m=nil [GC worker (idle), 3 minutes]: runtime.gopark(0x1db5aaee606c?, 0x3?, 0xc?, 0xfe?, 0x0?) runtime/proc.go:402 +0xce fp=0xc0000cf750 sp=0xc0000cf730 pc=0x5bb7383bc00e runtime.gcBgMarkWorker() runtime/mgc.go:1310 +0xe5 fp=0xc0000cf7e0 sp=0xc0000cf750 pc=0x5bb73839d585 runtime.goexit({}) runtime/asm_amd64.s:1695 +0x1 fp=0xc0000cf7e8 sp=0xc0000cf7e0 pc=0x5bb7383edde1 created by runtime.gcBgMarkStartWorkers in goroutine 18 runtime/mgc.go:1234 +0x1c goroutine 50 gp=0xc0005e2000 m=nil [GC worker (idle), 3 minutes]: runtime.gopark(0x1dd19bf2c88b?, 0x3?, 0xbf?, 0x40?, 0x0?) runtime/proc.go:402 +0xce fp=0xc0000c8750 sp=0xc0000c8730 pc=0x5bb7383bc00e runtime.gcBgMarkWorker() runtime/mgc.go:1310 +0xe5 fp=0xc0000c87e0 sp=0xc0000c8750 pc=0x5bb73839d585 runtime.goexit({}) runtime/asm_amd64.s:1695 +0x1 fp=0xc0000c87e8 sp=0xc0000c87e0 pc=0x5bb7383edde1 created by runtime.gcBgMarkStartWorkers in goroutine 18 runtime/mgc.go:1234 +0x1c goroutine 42 gp=0xc000083180 m=nil [GC worker (idle), 66 minutes]: runtime.gopark(0x1a5228359582?, 0x3?, 0x1c?, 0x4?, 0x0?) runtime/proc.go:402 +0xce fp=0xc0000cff50 sp=0xc0000cff30 pc=0x5bb7383bc00e runtime.gcBgMarkWorker() runtime/mgc.go:1310 +0xe5 fp=0xc0000cffe0 sp=0xc0000cff50 pc=0x5bb73839d585 runtime.goexit({}) runtime/asm_amd64.s:1695 +0x1 fp=0xc0000cffe8 sp=0xc0000cffe0 pc=0x5bb7383edde1 created by runtime.gcBgMarkStartWorkers in goroutine 18 runtime/mgc.go:1234 +0x1c goroutine 11112 gp=0xc000156a80 m=nil [IO wait, 4 minutes]: runtime.gopark(0x10?, 0x10?, 0xf0?, 0xbd?, 0xb?) 
runtime/proc.go:402 +0xce fp=0xc00019bda8 sp=0xc00019bd88 pc=0x5bb7383bc00e runtime.netpollblock(0x5bb738422558?, 0x38384b26?, 0xb7?) runtime/netpoll.go:573 +0xf7 fp=0xc00019bde0 sp=0xc00019bda8 pc=0x5bb7383b4257 internal/poll.runtime_pollWait(0x7ae067dc7ee8, 0x72) runtime/netpoll.go:345 +0x85 fp=0xc00019be00 sp=0xc00019bde0 pc=0x5bb7383e8aa5 internal/poll.(*pollDesc).wait(0xc000164a00?, 0xc00010ab81?, 0x0) internal/poll/fd_poll_runtime.go:84 +0x27 fp=0xc00019be28 sp=0xc00019be00 pc=0x5bb7384389c7 internal/poll.(*pollDesc).waitRead(...) internal/poll/fd_poll_runtime.go:89 internal/poll.(*FD).Read(0xc000164a00, {0xc00010ab81, 0x1, 0x1}) internal/poll/fd_unix.go:164 +0x27a fp=0xc00019bec0 sp=0xc00019be28 pc=0x5bb73843951a net.(*netFD).Read(0xc000164a00, {0xc00010ab81?, 0xc00019bf48?, 0x5bb7383ea6d0?}) net/fd_posix.go:55 +0x25 fp=0xc00019bf08 sp=0xc00019bec0 pc=0x5bb7384a77a5 net.(*conn).Read(0xc00004a000, {0xc00010ab81?, 0x385041544f792f41?, 0xc00010ab78?}) net/net.go:185 +0x45 fp=0xc00019bf50 sp=0xc00019bf08 pc=0x5bb7384b1a65 net.(*TCPConn).Read(0xc00010ab70?, {0xc00010ab81?, 0x3450472f58332f59?, 0x636f422b44786847?}) <autogenerated>:1 +0x25 fp=0xc00019bf80 sp=0xc00019bf50 pc=0x5bb7384bd445 net/http.(*connReader).backgroundRead(0xc00010ab70) net/http/server.go:681 +0x37 fp=0xc00019bfc8 sp=0xc00019bf80 pc=0x5bb7385cc1d7 net/http.(*connReader).startBackgroundRead.gowrap2() net/http/server.go:677 +0x25 fp=0xc00019bfe0 sp=0xc00019bfc8 pc=0x5bb7385cc105 runtime.goexit({}) runtime/asm_amd64.s:1695 +0x1 fp=0xc00019bfe8 sp=0xc00019bfe0 pc=0x5bb7383edde1 created by net/http.(*connReader).startBackgroundRead in goroutine 22 net/http/server.go:677 +0xba rax 0x0 rbx 0xa6ba rcx 0x7ae04269eb1c rdx 0x6 rdi 0xa6b7 rsi 0xa6ba rbp 0x7adffa7de010 rsp 0x7adffa7ddfd0 r8 0x0 r9 0x0 r10 0x8 r11 0x246 r12 0x6 r13 0xfcc r14 0x16 r15 0x0 rip 0x7ae04269eb1c rflags 0x246 cs 0x33 fs 0x0 gs 0x0 [GIN] 2024/11/19 - 07:09:34 | 500 | 3m37s | 127.0.0.1 | POST "/api/generate" ``` ### OS Linux ### GPU 
Nvidia ### CPU AMD ### Ollama version 0.4.1
GiteaMirror added the bug label 2026-04-12 16:00:23 -05:00
Author
Owner

@rick-github commented on GitHub (Nov 19, 2024):

Earlier parts of the server logs will contain information about configuration, resources, allocations, etc. Please include those.

<!-- gh-comment-id:2486931225 --> @rick-github commented on GitHub (Nov 19, 2024): Earlier parts of the server logs will contain information about configuration, resources, allocations, etc. Please include those.
Author
Owner

@pavelruzicka commented on GitHub (Nov 20, 2024):

Here is the server log from startup until llama3.2-vision:11b-instruct-q8_0 has finished loading:

2024/11/19 01:11:54 routes.go:1189: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/home/pavelruzicka/.ollama/models OLLAMA_MULTIUSER_CACHE:false OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://* vscode-webview://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
time=2024-11-19T01:11:54.135+01:00 level=INFO source=images.go:755 msg="total blobs: 10"
time=2024-11-19T01:11:54.136+01:00 level=INFO source=images.go:762 msg="total unused blobs removed: 0"
time=2024-11-19T01:11:54.136+01:00 level=INFO source=routes.go:1240 msg="Listening on 127.0.0.1:11434 (version 0.4.1)"
time=2024-11-19T01:11:54.136+01:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3801146460/runners
time=2024-11-19T01:11:54.294+01:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cuda_v11 cuda_v12 rocm cpu cpu_avx cpu_avx2]"
time=2024-11-19T01:11:54.294+01:00 level=INFO source=gpu.go:221 msg="looking for compatible GPUs"
time=2024-11-19T01:11:54.608+01:00 level=INFO source=types.go:123 msg="inference compute" id=GPU-c3524221-981b-b586-1371-190b0a2c30a6 library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3090" total="23.6 GiB" available="23.3 GiB"
[GIN] 2024/11/19 - 01:12:18 | 200 |      34.263µs |       127.0.0.1 | HEAD     "/"
[GIN] 2024/11/19 - 01:12:18 | 200 |   32.895812ms |       127.0.0.1 | POST     "/api/show"
time=2024-11-19T01:12:18.386+01:00 level=WARN source=sched.go:137 msg="mllama doesn't support parallel requests yet"
time=2024-11-19T01:12:18.647+01:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/home/pavelruzicka/.ollama/models/blobs/sha256-7ef0839fb71fbab13fda97c1b9819ffd99c799ba4f93d421ae1e2a46d68c5fa6 gpu=GPU-c3524221-981b-b586-1371-190b0a2c30a6 parallel=1 available=25045368832 required="15.3 GiB"
time=2024-11-19T01:12:18.828+01:00 level=INFO source=server.go:105 msg="system memory" total="31.3 GiB" free="29.1 GiB" free_swap="8.0 GiB"
time=2024-11-19T01:12:18.831+01:00 level=INFO source=memory.go:343 msg="offload to cuda" projector.weights="1.8 GiB" projector.graph="2.8 GiB" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[23.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="15.3 GiB" memory.required.partial="15.3 GiB" memory.required.kv="656.2 MiB" memory.required.allocations="[15.3 GiB]" memory.weights.total="9.3 GiB" memory.weights.repeating="8.8 GiB" memory.weights.nonrepeating="532.3 MiB" memory.graph.full="258.5 MiB" memory.graph.partial="669.5 MiB"
time=2024-11-19T01:12:18.833+01:00 level=INFO source=server.go:383 msg="starting llama server" cmd="/tmp/ollama3801146460/runners/cuda_v12/ollama_llama_server --model /home/pavelruzicka/.ollama/models/blobs/sha256-7ef0839fb71fbab13fda97c1b9819ffd99c799ba4f93d421ae1e2a46d68c5fa6 --ctx-size 2048 --batch-size 512 --n-gpu-layers 41 --mmproj /home/pavelruzicka/.ollama/models/blobs/sha256-ece5e659647a20a5c28ab9eea1c12a1ad430bc0f2a27021d00ad103b3bf5206f --threads 4 --parallel 1 --port 34163"
time=2024-11-19T01:12:18.834+01:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
time=2024-11-19T01:12:18.834+01:00 level=INFO source=server.go:562 msg="waiting for llama runner to start responding"
time=2024-11-19T01:12:18.834+01:00 level=INFO source=server.go:596 msg="waiting for server to become available" status="llm server error"
time=2024-11-19T01:12:18.877+01:00 level=INFO source=runner.go:863 msg="starting go runner"
time=2024-11-19T01:12:18.877+01:00 level=INFO source=runner.go:864 msg=system info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | RISCV_VECT = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | cgo(gcc)" threads=4
time=2024-11-19T01:12:18.877+01:00 level=INFO source=.:0 msg="Server listening on 127.0.0.1:34163"
llama_model_loader: loaded meta data with 27 key-value pairs and 396 tensors from /home/pavelruzicka/.ollama/models/blobs/sha256-7ef0839fb71fbab13fda97c1b9819ffd99c799ba4f93d421ae1e2a46d68c5fa6 (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = mllama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Model
llama_model_loader: - kv   3:                         general.size_label str              = 10B
llama_model_loader: - kv   4:                         mllama.block_count u32              = 40
llama_model_loader: - kv   5:                      mllama.context_length u32              = 131072
llama_model_loader: - kv   6:                    mllama.embedding_length u32              = 4096
llama_model_loader: - kv   7:                 mllama.feed_forward_length u32              = 14336
llama_model_loader: - kv   8:                mllama.attention.head_count u32              = 32
llama_model_loader: - kv   9:             mllama.attention.head_count_kv u32              = 8
llama_model_loader: - kv  10:                      mllama.rope.freq_base f32              = 500000.000000
llama_model_loader: - kv  11:    mllama.attention.layer_norm_rms_epsilon f32              = 0.000010
llama_model_loader: - kv  12:                          general.file_type u32              = 7
llama_model_loader: - kv  13:                          mllama.vocab_size u32              = 128256
llama_model_loader: - kv  14:                mllama.rope.dimension_count u32              = 128
llama_model_loader: - kv  15:    mllama.attention.cross_attention_layers arr[i32,8]       = [3, 8, 13, 18, 23, 28, 33, 38]
llama_model_loader: - kv  16:               tokenizer.ggml.add_bos_token bool             = true
llama_model_loader: - kv  17:                       tokenizer.ggml.model str              = gpt2
llama_model_loader: - kv  18:                         tokenizer.ggml.pre str              = llama-bpe
llama_model_loader: - kv  19:                      tokenizer.ggml.tokens arr[str,128257]  = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv  20:                  tokenizer.ggml.token_type arr[i32,128257]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
time=2024-11-19T01:12:19.086+01:00 level=INFO source=server.go:596 msg="waiting for server to become available" status="llm server loading model"
llama_model_loader: - kv  21:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
llama_model_loader: - kv  22:                tokenizer.ggml.bos_token_id u32              = 128000
llama_model_loader: - kv  23:                tokenizer.ggml.eos_token_id u32              = 128009
llama_model_loader: - kv  24:            tokenizer.ggml.padding_token_id u32              = 128004
llama_model_loader: - kv  25:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
llama_model_loader: - kv  26:               general.quantization_version u32              = 2
llama_model_loader: - type  f32:  114 tensors
llama_model_loader: - type q8_0:  282 tensors
llm_load_vocab: special tokens cache size = 257
llm_load_vocab: token to piece cache size = 0.7999 MB
llm_load_print_meta: format           = GGUF V3 (latest)
llm_load_print_meta: arch             = mllama
llm_load_print_meta: vocab type       = BPE
llm_load_print_meta: n_vocab          = 128256
llm_load_print_meta: n_merges         = 280147
llm_load_print_meta: vocab_only       = 0
llm_load_print_meta: n_ctx_train      = 131072
llm_load_print_meta: n_embd           = 4096
llm_load_print_meta: n_layer          = 40
llm_load_print_meta: n_head           = 32
llm_load_print_meta: n_head_kv        = 8
llm_load_print_meta: n_rot            = 128
llm_load_print_meta: n_swa            = 0
llm_load_print_meta: n_embd_head_k    = 128
llm_load_print_meta: n_embd_head_v    = 128
llm_load_print_meta: n_gqa            = 4
llm_load_print_meta: n_embd_k_gqa     = 1024
llm_load_print_meta: n_embd_v_gqa     = 1024
llm_load_print_meta: f_norm_eps       = 0.0e+00
llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
llm_load_print_meta: f_clamp_kqv      = 0.0e+00
llm_load_print_meta: f_max_alibi_bias = 0.0e+00
llm_load_print_meta: f_logit_scale    = 0.0e+00
llm_load_print_meta: n_ff             = 14336
llm_load_print_meta: n_expert         = 0
llm_load_print_meta: n_expert_used    = 0
llm_load_print_meta: causal attn      = 1
llm_load_print_meta: pooling type     = 0
llm_load_print_meta: rope type        = 0
llm_load_print_meta: rope scaling     = linear
llm_load_print_meta: freq_base_train  = 500000.0
llm_load_print_meta: freq_scale_train = 1
llm_load_print_meta: n_ctx_orig_yarn  = 131072
llm_load_print_meta: rope_finetuned   = unknown
llm_load_print_meta: ssm_d_conv       = 0
llm_load_print_meta: ssm_d_inner      = 0
llm_load_print_meta: ssm_d_state      = 0
llm_load_print_meta: ssm_dt_rank      = 0
llm_load_print_meta: ssm_dt_b_c_rms   = 0
llm_load_print_meta: model type       = 11B
llm_load_print_meta: model ftype      = Q8_0
llm_load_print_meta: model params     = 9.78 B
llm_load_print_meta: model size       = 9.67 GiB (8.50 BPW)
llm_load_print_meta: general.name     = Model
llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
llm_load_print_meta: PAD token        = 128004 '<|finetune_right_pad_id|>'
llm_load_print_meta: LF token         = 128 'Ä'
llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
llm_load_print_meta: EOM token        = 128008 '<|eom_id|>'
llm_load_print_meta: EOG token        = 128008 '<|eom_id|>'
llm_load_print_meta: EOG token        = 128009 '<|eot_id|>'
llm_load_print_meta: max token length = 256
llama_model_load: vocab mismatch 128256 !- 128257 ...
ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
llm_load_tensors: ggml ctx size =    0.36 MiB
llm_load_tensors: offloading 40 repeating layers to GPU
llm_load_tensors: offloading non-repeating layers to GPU
llm_load_tensors: offloaded 41/41 layers to GPU
llm_load_tensors:        CPU buffer size =   532.35 MiB
llm_load_tensors:      CUDA0 buffer size =  9373.60 MiB
llama_new_context_with_model: n_ctx      = 2048
llama_new_context_with_model: n_batch    = 512
llama_new_context_with_model: n_ubatch   = 512
llama_new_context_with_model: flash_attn = 0
llama_new_context_with_model: freq_base  = 500000.0
llama_new_context_with_model: freq_scale = 1
llama_kv_cache_init:      CUDA0 KV buffer size =   656.25 MiB
llama_new_context_with_model: KV self size  =  656.25 MiB, K (f16):  328.12 MiB, V (f16):  328.12 MiB
llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
llama_new_context_with_model: graph nodes  = 1030
llama_new_context_with_model: graph splits = 2
mllama_model_load: model name:   Llama-3.2-11B-Vision-Instruct
mllama_model_load: description:  vision encoder for Mllama
mllama_model_load: GGUF version: 3
mllama_model_load: alignment:    32
mllama_model_load: n_tensors:    512
mllama_model_load: n_kv:         17
mllama_model_load: ftype:        f16
mllama_model_load:
mllama_model_load: vision using CUDA backend
mllama_model_load: compute allocated memory: 2853.34 MB
time=2024-11-19T01:12:44.441+01:00 level=INFO source=server.go:601 msg="llama runner started in 25.61 seconds"
<!-- gh-comment-id:2487025912 --> @pavelruzicka commented on GitHub (Nov 20, 2024): Here is the server log from startup to until `llama3.2-vision:11b-instruct-q8_0` has finished loading: ``` 2024/11/19 01:11:54 routes.go:1189: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/home/pavelruzicka/.ollama/models OLLAMA_MULTIUSER_CACHE:false OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://* vscode-webview://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]" time=2024-11-19T01:11:54.135+01:00 level=INFO source=images.go:755 msg="total blobs: 10" time=2024-11-19T01:11:54.136+01:00 level=INFO source=images.go:762 msg="total unused blobs removed: 0" time=2024-11-19T01:11:54.136+01:00 level=INFO source=routes.go:1240 msg="Listening on 127.0.0.1:11434 (version 0.4.1)" time=2024-11-19T01:11:54.136+01:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3801146460/runners time=2024-11-19T01:11:54.294+01:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cuda_v11 cuda_v12 rocm cpu cpu_avx cpu_avx2]" time=2024-11-19T01:11:54.294+01:00 level=INFO source=gpu.go:221 msg="looking for compatible GPUs" time=2024-11-19T01:11:54.608+01:00 level=INFO source=types.go:123 msg="inference compute" id=GPU-c3524221-981b-b586-1371-190b0a2c30a6 library=cuda variant=v12 
compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3090" total="23.6 GiB" available="23.3 GiB" [GIN] 2024/11/19 - 01:12:18 | 200 | 34.263µs | 127.0.0.1 | HEAD "/" [GIN] 2024/11/19 - 01:12:18 | 200 | 32.895812ms | 127.0.0.1 | POST "/api/show" time=2024-11-19T01:12:18.386+01:00 level=WARN source=sched.go:137 msg="mllama doesn't support parallel requests yet" time=2024-11-19T01:12:18.647+01:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/home/pavelruzicka/.ollama/models/blobs/sha256-7ef0839fb71fbab13fda97c1b9819ffd99c799ba4f93d421ae1e2a46d68c5fa6 gpu=GPU-c3524221-981b-b586-1371-190b0a2c30a6 parallel=1 available=25045368832 required="15.3 GiB" time=2024-11-19T01:12:18.828+01:00 level=INFO source=server.go:105 msg="system memory" total="31.3 GiB" free="29.1 GiB" free_swap="8.0 GiB" time=2024-11-19T01:12:18.831+01:00 level=INFO source=memory.go:343 msg="offload to cuda" projector.weights="1.8 GiB" projector.graph="2.8 GiB" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[23.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="15.3 GiB" memory.required.partial="15.3 GiB" memory.required.kv="656.2 MiB" memory.required.allocations="[15.3 GiB]" memory.weights.total="9.3 GiB" memory.weights.repeating="8.8 GiB" memory.weights.nonrepeating="532.3 MiB" memory.graph.full="258.5 MiB" memory.graph.partial="669.5 MiB" time=2024-11-19T01:12:18.833+01:00 level=INFO source=server.go:383 msg="starting llama server" cmd="/tmp/ollama3801146460/runners/cuda_v12/ollama_llama_server --model /home/pavelruzicka/.ollama/models/blobs/sha256-7ef0839fb71fbab13fda97c1b9819ffd99c799ba4f93d421ae1e2a46d68c5fa6 --ctx-size 2048 --batch-size 512 --n-gpu-layers 41 --mmproj /home/pavelruzicka/.ollama/models/blobs/sha256-ece5e659647a20a5c28ab9eea1c12a1ad430bc0f2a27021d00ad103b3bf5206f --threads 4 --parallel 1 --port 34163" time=2024-11-19T01:12:18.834+01:00 level=INFO source=sched.go:449 msg="loaded runners" 
count=1 time=2024-11-19T01:12:18.834+01:00 level=INFO source=server.go:562 msg="waiting for llama runner to start responding" time=2024-11-19T01:12:18.834+01:00 level=INFO source=server.go:596 msg="waiting for server to become available" status="llm server error" time=2024-11-19T01:12:18.877+01:00 level=INFO source=runner.go:863 msg="starting go runner" time=2024-11-19T01:12:18.877+01:00 level=INFO source=runner.go:864 msg=system info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | RISCV_VECT = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | cgo(gcc)" threads=4 time=2024-11-19T01:12:18.877+01:00 level=INFO source=.:0 msg="Server listening on 127.0.0.1:34163" llama_model_loader: loaded meta data with 27 key-value pairs and 396 tensors from /home/pavelruzicka/.ollama/models/blobs/sha256-7ef0839fb71fbab13fda97c1b9819ffd99c799ba4f93d421ae1e2a46d68c5fa6 (version GGUF V3 (latest)) llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. 
llama_model_loader: - kv 0: general.architecture str = mllama llama_model_loader: - kv 1: general.type str = model llama_model_loader: - kv 2: general.name str = Model llama_model_loader: - kv 3: general.size_label str = 10B llama_model_loader: - kv 4: mllama.block_count u32 = 40 llama_model_loader: - kv 5: mllama.context_length u32 = 131072 llama_model_loader: - kv 6: mllama.embedding_length u32 = 4096 llama_model_loader: - kv 7: mllama.feed_forward_length u32 = 14336 llama_model_loader: - kv 8: mllama.attention.head_count u32 = 32 llama_model_loader: - kv 9: mllama.attention.head_count_kv u32 = 8 llama_model_loader: - kv 10: mllama.rope.freq_base f32 = 500000.000000 llama_model_loader: - kv 11: mllama.attention.layer_norm_rms_epsilon f32 = 0.000010 llama_model_loader: - kv 12: general.file_type u32 = 7 llama_model_loader: - kv 13: mllama.vocab_size u32 = 128256 llama_model_loader: - kv 14: mllama.rope.dimension_count u32 = 128 llama_model_loader: - kv 15: mllama.attention.cross_attention_layers arr[i32,8] = [3, 8, 13, 18, 23, 28, 33, 38] llama_model_loader: - kv 16: tokenizer.ggml.add_bos_token bool = true llama_model_loader: - kv 17: tokenizer.ggml.model str = gpt2 llama_model_loader: - kv 18: tokenizer.ggml.pre str = llama-bpe llama_model_loader: - kv 19: tokenizer.ggml.tokens arr[str,128257] = ["!", "\"", "#", "$", "%", "&", "'", ... llama_model_loader: - kv 20: tokenizer.ggml.token_type arr[i32,128257] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... time=2024-11-19T01:12:19.086+01:00 level=INFO source=server.go:596 msg="waiting for server to become available" status="llm server loading model" llama_model_loader: - kv 21: tokenizer.ggml.merges arr[str,280147] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... 
llama_model_loader: - kv 22: tokenizer.ggml.bos_token_id u32 = 128000 llama_model_loader: - kv 23: tokenizer.ggml.eos_token_id u32 = 128009 llama_model_loader: - kv 24: tokenizer.ggml.padding_token_id u32 = 128004 llama_model_loader: - kv 25: tokenizer.chat_template str = {{- bos_token }}\n{%- if custom_tools ... llama_model_loader: - kv 26: general.quantization_version u32 = 2 llama_model_loader: - type f32: 114 tensors llama_model_loader: - type q8_0: 282 tensors llm_load_vocab: special tokens cache size = 257 llm_load_vocab: token to piece cache size = 0.7999 MB llm_load_print_meta: format = GGUF V3 (latest) llm_load_print_meta: arch = mllama llm_load_print_meta: vocab type = BPE llm_load_print_meta: n_vocab = 128256 llm_load_print_meta: n_merges = 280147 llm_load_print_meta: vocab_only = 0 llm_load_print_meta: n_ctx_train = 131072 llm_load_print_meta: n_embd = 4096 llm_load_print_meta: n_layer = 40 llm_load_print_meta: n_head = 32 llm_load_print_meta: n_head_kv = 8 llm_load_print_meta: n_rot = 128 llm_load_print_meta: n_swa = 0 llm_load_print_meta: n_embd_head_k = 128 llm_load_print_meta: n_embd_head_v = 128 llm_load_print_meta: n_gqa = 4 llm_load_print_meta: n_embd_k_gqa = 1024 llm_load_print_meta: n_embd_v_gqa = 1024 llm_load_print_meta: f_norm_eps = 0.0e+00 llm_load_print_meta: f_norm_rms_eps = 1.0e-05 llm_load_print_meta: f_clamp_kqv = 0.0e+00 llm_load_print_meta: f_max_alibi_bias = 0.0e+00 llm_load_print_meta: f_logit_scale = 0.0e+00 llm_load_print_meta: n_ff = 14336 llm_load_print_meta: n_expert = 0 llm_load_print_meta: n_expert_used = 0 llm_load_print_meta: causal attn = 1 llm_load_print_meta: pooling type = 0 llm_load_print_meta: rope type = 0 llm_load_print_meta: rope scaling = linear llm_load_print_meta: freq_base_train = 500000.0 llm_load_print_meta: freq_scale_train = 1 llm_load_print_meta: n_ctx_orig_yarn = 131072 llm_load_print_meta: rope_finetuned = unknown llm_load_print_meta: ssm_d_conv = 0 llm_load_print_meta: ssm_d_inner = 0 
llm_load_print_meta: ssm_d_state = 0 llm_load_print_meta: ssm_dt_rank = 0 llm_load_print_meta: ssm_dt_b_c_rms = 0 llm_load_print_meta: model type = 11B llm_load_print_meta: model ftype = Q8_0 llm_load_print_meta: model params = 9.78 B llm_load_print_meta: model size = 9.67 GiB (8.50 BPW) llm_load_print_meta: general.name = Model llm_load_print_meta: BOS token = 128000 '<|begin_of_text|>' llm_load_print_meta: EOS token = 128009 '<|eot_id|>' llm_load_print_meta: PAD token = 128004 '<|finetune_right_pad_id|>' llm_load_print_meta: LF token = 128 'Ä' llm_load_print_meta: EOT token = 128009 '<|eot_id|>' llm_load_print_meta: EOM token = 128008 '<|eom_id|>' llm_load_print_meta: EOG token = 128008 '<|eom_id|>' llm_load_print_meta: EOG token = 128009 '<|eot_id|>' llm_load_print_meta: max token length = 256 llama_model_load: vocab mismatch 128256 !- 128257 ... ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 CUDA devices: Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes llm_load_tensors: ggml ctx size = 0.36 MiB llm_load_tensors: offloading 40 repeating layers to GPU llm_load_tensors: offloading non-repeating layers to GPU llm_load_tensors: offloaded 41/41 layers to GPU llm_load_tensors: CPU buffer size = 532.35 MiB llm_load_tensors: CUDA0 buffer size = 9373.60 MiB llama_new_context_with_model: n_ctx = 2048 llama_new_context_with_model: n_batch = 512 llama_new_context_with_model: n_ubatch = 512 llama_new_context_with_model: flash_attn = 0 llama_new_context_with_model: freq_base = 500000.0 llama_new_context_with_model: freq_scale = 1 llama_kv_cache_init: CUDA0 KV buffer size = 656.25 MiB llama_new_context_with_model: KV self size = 656.25 MiB, K (f16): 328.12 MiB, V (f16): 328.12 MiB llama_new_context_with_model: CUDA_Host output buffer size = 0.50 MiB llama_new_context_with_model: CUDA0 compute buffer size = 258.50 MiB llama_new_context_with_model: CUDA_Host compute buffer size = 12.01 MiB 
llama_new_context_with_model: graph nodes = 1030 llama_new_context_with_model: graph splits = 2 mllama_model_load: model name: Llama-3.2-11B-Vision-Instruct mllama_model_load: description: vision encoder for Mllama mllama_model_load: GGUF version: 3 mllama_model_load: alignment: 32 mllama_model_load: n_tensors: 512 mllama_model_load: n_kv: 17 mllama_model_load: ftype: f16 mllama_model_load: mllama_model_load: vision using CUDA backend mllama_model_load: compute allocated memory: 2853.34 MB time=2024-11-19T01:12:44.441+01:00 level=INFO source=server.go:601 msg="llama runner started in 25.61 seconds" ```
Author
Owner

@berezhinskiy commented on GitHub (Jan 27, 2025):

I got the same GGML_ASSERT error while utilizing the llama3.2-vision:11b-instruct-q8_0 model under the same conditions.

<!-- gh-comment-id:2615655034 --> @berezhinskiy commented on GitHub (Jan 27, 2025): I got the same `GGML_ASSERT` error while utilizing the `llama3.2-vision:11b-instruct-q8_0` model under the same conditions.
Author
Owner

@rick-github commented on GitHub (Jan 27, 2025):

If you can provide prompt and full server logs it will aid in debugging.

<!-- gh-comment-id:2615682978 --> @rick-github commented on GitHub (Jan 27, 2025): If you can provide prompt and full server logs it will aid in debugging.
Author
Owner

@anton-b commented on GitHub (Feb 12, 2025):

Having the same issue:

ggml_cuda_init: found 1 CUDA devices:
  Device 0: NVIDIA L40S, compute capability 8.9, VMM: yes
time=2025-02-12T15:42:39.343Z level=INFO source=runner.go:937 msg=system info="CUDA : ARCHS = 600,610,620,700,720,750,800,860,870,890,900 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | LLAMAFILE = 1 | AARCH64_REPACK = 1 | cgo(gcc)" threads=2
time=2025-02-12T15:42:39.343Z level=INFO source=.:0 msg="Server listening on 127.0.0.1:38761"
llama_load_model_from_file: using device CUDA0 (NVIDIA L40S) - 45055 MiB free
llama_model_loader: loaded meta data with 27 key-value pairs and 396 tensors from /home/gpu_runtime/.ollama/models/blobs/sha256-7ef0839fb71fbab13fda97c1b9819ffd99c799ba4f93d421ae1e2a46d68c5fa6 (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = mllama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Model
llama_model_loader: - kv   3:                         general.size_label str              = 10B
llama_model_loader: - kv   4:                         mllama.block_count u32              = 40
llama_model_loader: - kv   5:                      mllama.context_length u32              = 131072
llama_model_loader: - kv   6:                    mllama.embedding_length u32              = 4096
llama_model_loader: - kv   7:                 mllama.feed_forward_length u32              = 14336
llama_model_loader: - kv   8:                mllama.attention.head_count u32              = 32
llama_model_loader: - kv   9:             mllama.attention.head_count_kv u32              = 8
llama_model_loader: - kv  10:                      mllama.rope.freq_base f32              = 500000.000000
llama_model_loader: - kv  11:    mllama.attention.layer_norm_rms_epsilon f32              = 0.000010
llama_model_loader: - kv  12:                          general.file_type u32              = 7
llama_model_loader: - kv  13:                          mllama.vocab_size u32              = 128256
llama_model_loader: - kv  14:                mllama.rope.dimension_count u32              = 128
llama_model_loader: - kv  15:    mllama.attention.cross_attention_layers arr[i32,8]       = [3, 8, 13, 18, 23, 28, 33, 38]
llama_model_loader: - kv  16:               tokenizer.ggml.add_bos_token bool             = true
llama_model_loader: - kv  17:                       tokenizer.ggml.model str              = gpt2
llama_model_loader: - kv  18:                         tokenizer.ggml.pre str              = llama-bpe
llama_model_loader: - kv  19:                      tokenizer.ggml.tokens arr[str,128257]  = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv  20:                  tokenizer.ggml.token_type arr[i32,128257]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
time=2025-02-12T15:42:39.547Z level=INFO source=server.go:589 msg="waiting for server to become available" status="llm server loading model"
llama_model_loader: - kv  21:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
llama_model_loader: - kv  22:                tokenizer.ggml.bos_token_id u32              = 128000
llama_model_loader: - kv  23:                tokenizer.ggml.eos_token_id u32              = 128009
llama_model_loader: - kv  24:            tokenizer.ggml.padding_token_id u32              = 128004
llama_model_loader: - kv  25:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
llama_model_loader: - kv  26:               general.quantization_version u32              = 2
llama_model_loader: - type  f32:  114 tensors
llama_model_loader: - type q8_0:  282 tensors
llm_load_vocab: special tokens cache size = 257
llm_load_vocab: token to piece cache size = 0.7999 MB
llm_load_print_meta: format           = GGUF V3 (latest)
llm_load_print_meta: arch             = mllama
llm_load_print_meta: vocab type       = BPE
llm_load_print_meta: n_vocab          = 128256
llm_load_print_meta: n_merges         = 280147
llm_load_print_meta: vocab_only       = 0
llm_load_print_meta: n_ctx_train      = 131072
llm_load_print_meta: n_embd           = 4096
llm_load_print_meta: n_layer          = 40
llm_load_print_meta: n_head           = 32
llm_load_print_meta: n_head_kv        = 8
llm_load_print_meta: n_rot            = 128
llm_load_print_meta: n_swa            = 0
llm_load_print_meta: n_embd_head_k    = 128
llm_load_print_meta: n_embd_head_v    = 128
llm_load_print_meta: n_gqa            = 4
llm_load_print_meta: n_embd_k_gqa     = 1024
llm_load_print_meta: n_embd_v_gqa     = 1024
llm_load_print_meta: f_norm_eps       = 0.0e+00
llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
llm_load_print_meta: f_clamp_kqv      = 0.0e+00
llm_load_print_meta: f_max_alibi_bias = 0.0e+00
llm_load_print_meta: f_logit_scale    = 0.0e+00
llm_load_print_meta: n_ff             = 14336
llm_load_print_meta: n_expert         = 0
llm_load_print_meta: n_expert_used    = 0
llm_load_print_meta: causal attn      = 1
llm_load_print_meta: pooling type     = 0
llm_load_print_meta: rope type        = 0
llm_load_print_meta: rope scaling     = linear
llm_load_print_meta: freq_base_train  = 500000.0
llm_load_print_meta: freq_scale_train = 1
llm_load_print_meta: n_ctx_orig_yarn  = 131072
llm_load_print_meta: rope_finetuned   = unknown
llm_load_print_meta: ssm_d_conv       = 0
llm_load_print_meta: ssm_d_inner      = 0
llm_load_print_meta: ssm_d_state      = 0
llm_load_print_meta: ssm_dt_rank      = 0
llm_load_print_meta: ssm_dt_b_c_rms   = 0
llm_load_print_meta: model type       = 11B
llm_load_print_meta: model ftype      = Q8_0
llm_load_print_meta: model params     = 9.78 B
llm_load_print_meta: model size       = 9.67 GiB (8.50 BPW)
llm_load_print_meta: general.name     = Model
llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
llm_load_print_meta: EOM token        = 128008 '<|eom_id|>'
llm_load_print_meta: PAD token        = 128004 '<|finetune_right_pad_id|>'
llm_load_print_meta: LF token         = 128 'Ä'
llm_load_print_meta: EOG token        = 128008 '<|eom_id|>'
llm_load_print_meta: EOG token        = 128009 '<|eot_id|>'
llm_load_print_meta: max token length = 256
llama_model_load: vocab mismatch 128256 !- 128257 ...
[GIN] 2025/02/12 - 15:42:40 | 200 |        30.3µs |       127.0.0.1 | HEAD     "/"
[GIN] 2025/02/12 - 15:42:40 | 200 |      33.351µs |       127.0.0.1 | GET      "/api/ps"
llm_load_tensors: offloading 40 repeating layers to GPU
llm_load_tensors: offloading output layer to GPU
llm_load_tensors: offloaded 41/41 layers to GPU
llm_load_tensors:   CPU_Mapped model buffer size =   532.35 MiB
llm_load_tensors:        CUDA0 model buffer size =  9373.59 MiB
llama_new_context_with_model: n_seq_max     = 1
llama_new_context_with_model: n_ctx         = 16384
llama_new_context_with_model: n_ctx_per_seq = 16384
llama_new_context_with_model: n_batch       = 512
llama_new_context_with_model: n_ubatch      = 512
llama_new_context_with_model: flash_attn    = 0
llama_new_context_with_model: freq_base     = 500000.0
llama_new_context_with_model: freq_scale    = 1
llama_new_context_with_model: n_ctx_per_seq (16384) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
llama_kv_cache_init: kv_size = 16384, offload = 1, type_k = 'f16', type_v = 'f16', n_layer = 40, can_shift = 1
llama_kv_cache_init:      CUDA0 KV buffer size =  2448.25 MiB
llama_new_context_with_model: KV self size  = 2448.25 MiB, K (f16): 1224.12 MiB, V (f16): 1224.12 MiB
llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
llama_new_context_with_model: graph nodes  = 1262
llama_new_context_with_model: graph splits = 2
mllama_model_load: model name:   Llama-3.2-11B-Vision-Instruct
mllama_model_load: description:  vision encoder for Mllama
mllama_model_load: GGUF version: 3
mllama_model_load: alignment:    32
mllama_model_load: n_tensors:    512
mllama_model_load: n_kv:         17
mllama_model_load: ftype:        f16
mllama_model_load:
mllama_model_load: vision using CUDA backend
[GIN] 2025/02/12 - 15:42:41 | 200 |      30.161µs |       127.0.0.1 | HEAD     "/"
[GIN] 2025/02/12 - 15:42:41 | 200 |      40.801µs |       127.0.0.1 | GET      "/api/ps"
mllama_model_load: compute allocated memory: 2853.34 MB
time=2025-02-12T15:42:42.306Z level=INFO source=server.go:594 msg="llama runner started in 3.01 seconds"
llama_model_loader: loaded meta data with 27 key-value pairs and 396 tensors from /home/gpu_runtime/.ollama/models/blobs/sha256-7ef0839fb71fbab13fda97c1b9819ffd99c799ba4f93d421ae1e2a46d68c5fa6 (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = mllama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Model
llama_model_loader: - kv   3:                         general.size_label str              = 10B
llama_model_loader: - kv   4:                         mllama.block_count u32              = 40
llama_model_loader: - kv   5:                      mllama.context_length u32              = 131072
llama_model_loader: - kv   6:                    mllama.embedding_length u32              = 4096
llama_model_loader: - kv   7:                 mllama.feed_forward_length u32              = 14336
llama_model_loader: - kv   8:                mllama.attention.head_count u32              = 32
llama_model_loader: - kv   9:             mllama.attention.head_count_kv u32              = 8
llama_model_loader: - kv  10:                      mllama.rope.freq_base f32              = 500000.000000
llama_model_loader: - kv  11:    mllama.attention.layer_norm_rms_epsilon f32              = 0.000010
llama_model_loader: - kv  12:                          general.file_type u32              = 7
llama_model_loader: - kv  13:                          mllama.vocab_size u32              = 128256
llama_model_loader: - kv  14:                mllama.rope.dimension_count u32              = 128
llama_model_loader: - kv  15:    mllama.attention.cross_attention_layers arr[i32,8]       = [3, 8, 13, 18, 23, 28, 33, 38]
llama_model_loader: - kv  16:               tokenizer.ggml.add_bos_token bool             = true
llama_model_loader: - kv  17:                       tokenizer.ggml.model str              = gpt2
llama_model_loader: - kv  18:                         tokenizer.ggml.pre str              = llama-bpe
llama_model_loader: - kv  19:                      tokenizer.ggml.tokens arr[str,128257]  = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv  20:                  tokenizer.ggml.token_type arr[i32,128257]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv  21:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
llama_model_loader: - kv  22:                tokenizer.ggml.bos_token_id u32              = 128000
llama_model_loader: - kv  23:                tokenizer.ggml.eos_token_id u32              = 128009
llama_model_loader: - kv  24:            tokenizer.ggml.padding_token_id u32              = 128004
llama_model_loader: - kv  25:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
llama_model_loader: - kv  26:               general.quantization_version u32              = 2
llama_model_loader: - type  f32:  114 tensors
llama_model_loader: - type q8_0:  282 tensors
llm_load_vocab: special tokens cache size = 257
llm_load_vocab: token to piece cache size = 0.7999 MB
llm_load_print_meta: format           = GGUF V3 (latest)
llm_load_print_meta: arch             = mllama
llm_load_print_meta: vocab type       = BPE
llm_load_print_meta: n_vocab          = 128256
llm_load_print_meta: n_merges         = 280147
llm_load_print_meta: vocab_only       = 1
llm_load_print_meta: model type       = ?B
llm_load_print_meta: model ftype      = all F32
llm_load_print_meta: model params     = 9.78 B
llm_load_print_meta: model size       = 9.67 GiB (8.50 BPW)
llm_load_print_meta: general.name     = Model
llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
llm_load_print_meta: EOM token        = 128008 '<|eom_id|>'
llm_load_print_meta: PAD token        = 128004 '<|finetune_right_pad_id|>'
llm_load_print_meta: LF token         = 128 'Ä'
llm_load_print_meta: EOG token        = 128008 '<|eom_id|>'
llm_load_print_meta: EOG token        = 128009 '<|eot_id|>'
llm_load_print_meta: max token length = 256
llama_model_load: vocab mismatch 128256 !- 128257 ...
llama_model_load: vocab only - skipping tensors
[GIN] 2025/02/12 - 15:42:49 | 200 | 16.104305385s | 192.168.114.106 | POST     "/api/chat"
time=2025-02-12T15:42:50.097Z level=WARN source=sched.go:137 msg="mllama doesn't support parallel requests yet"
ggml.c:1600: GGML_ASSERT(view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src)) failed
SIGSEGV: segmentation violation
PC=0x7ff2dadefe57 m=5 sigcode=1 addr=0x206803fcc
signal arrived during cgo execution

goroutine 19 gp=0xc00008c540 m=5 mp=0xc000088008 [syscall]:
runtime.cgocall(0x55c065cdcfb0, 0xc000061ba0)
        runtime/cgocall.go:167 +0x4b fp=0xc000061b78 sp=0xc000061b40 pc=0x55c065a914eb
github.com/ollama/ollama/llama._Cfunc_llama_decode(0x7ff258c39300, {0x1, 0x7ff258d88d00, 0x0, 0x0, 0x7ff258b81890, 0x7ff258b820a0, 0x7ff258b18630, 0x7ff259ae3550})
        _cgo_gotypes.go:556 +0x4f fp=0xc000061ba0 sp=0xc000061b78 pc=0x55c065b3b56f
github.com/ollama/ollama/llama.(*Context).Decode.func1(0x55c065cd86eb?, 0x7ff258c39300?)
        github.com/ollama/ollama/llama/llama.go:207 +0xf5 fp=0xc000061c90 sp=0xc000061ba0 pc=0x55c065b3dd95
github.com/ollama/ollama/llama.(*Context).Decode(0x55c066368320?, 0x0?)
        github.com/ollama/ollama/llama/llama.go:207 +0x13 fp=0xc000061cd8 sp=0xc000061c90 pc=0x55c065b3dc13
github.com/ollama/ollama/llama/runner.(*Server).processBatch(0xc0000ac000, 0xc0001ac0c0, 0xc0001ac120)
        github.com/ollama/ollama/llama/runner/runner.go:434 +0x23f fp=0xc000061ee0 sp=0xc000061cd8 pc=0x55c065cd74df
github.com/ollama/ollama/llama/runner.(*Server).run(0xc0000ac000, {0x55c0660d80c0, 0xc0000aa050})
        github.com/ollama/ollama/llama/runner/runner.go:342 +0x1d5 fp=0xc000061fb8 sp=0xc000061ee0 pc=0x55c065cd6f15
github.com/ollama/ollama/llama/runner.Execute.gowrap2()
        github.com/ollama/ollama/llama/runner/runner.go:975 +0x28 fp=0xc000061fe0 sp=0xc000061fb8 pc=0x55c065cdbe08
runtime.goexit({})
        runtime/asm_amd64.s:1700 +0x1 fp=0xc000061fe8 sp=0xc000061fe0 pc=0x55c065a9ef21
created by github.com/ollama/ollama/llama/runner.Execute in goroutine 1
        github.com/ollama/ollama/llama/runner/runner.go:975 +0xde5

goroutine 1 gp=0xc0000061c0 m=nil [IO wait]:
runtime.gopark(0x0?, 0x0?, 0x0?, 0x0?, 0x0?)
        runtime/proc.go:424 +0xce fp=0xc00013b7b0 sp=0xc00013b790 pc=0x55c065a972ee
runtime.netpollblock(0xc000031800?, 0x65a2fb46?, 0xc0?)
        runtime/netpoll.go:575 +0xf7 fp=0xc00013b7e8 sp=0xc00013b7b0 pc=0x55c065a5c057
internal/poll.runtime_pollWait(0x7ff2d39c4e00, 0x72)
        runtime/netpoll.go:351 +0x85 fp=0xc00013b808 sp=0xc00013b7e8 pc=0x55c065a965e5
internal/poll.(*pollDesc).wait(0xc0000a6100?, 0x900000036?, 0x0)
        internal/poll/fd_poll_runtime.go:84 +0x27 fp=0xc00013b830 sp=0xc00013b808 pc=0x55c065aec427
internal/poll.(*pollDesc).waitRead(...)
        internal/poll/fd_poll_runtime.go:89
internal/poll.(*FD).Accept(0xc0000a6100)
        internal/poll/fd_unix.go:620 +0x295 fp=0xc00013b8d8 sp=0xc00013b830 pc=0x55c065aed995
net.(*netFD).accept(0xc0000a6100)
        net/fd_unix.go:172 +0x29 fp=0xc00013b990 sp=0xc00013b8d8 pc=0x55c065b66269
net.(*TCPListener).accept(0xc0000b0040)
        net/tcpsock_posix.go:159 +0x1e fp=0xc00013b9e0 sp=0xc00013b990 pc=0x55c065b768be
net.(*TCPListener).Accept(0xc0000b0040)
        net/tcpsock.go:372 +0x30 fp=0xc00013ba10 sp=0xc00013b9e0 pc=0x55c065b75bf0
net/http.(*onceCloseListener).Accept(0xc0000ac2d0?)
        <autogenerated>:1 +0x24 fp=0xc00013ba28 sp=0xc00013ba10 pc=0x55c065cb47c4
net/http.(*Server).Serve(0xc00009e3c0, {0x55c0660d7ad8, 0xc0000b0040})
        net/http/server.go:3330 +0x30c fp=0xc00013bb58 sp=0xc00013ba28 pc=0x55c065ca650c
github.com/ollama/ollama/llama/runner.Execute({0xc000016130?, 0x55c065a9eb7c?, 0x0?})
        github.com/ollama/ollama/llama/runner/runner.go:996 +0x11a9 fp=0xc00013bef8 sp=0xc00013bb58 pc=0x55c065cdbae9
main.main()
        github.com/ollama/ollama/cmd/runner/main.go:11 +0x54 fp=0xc00013bf50 sp=0xc00013bef8 pc=0x55c065cdca74
runtime.main()
        runtime/proc.go:272 +0x29d fp=0xc00013bfe0 sp=0xc00013bf50 pc=0x55c065a6363d
runtime.goexit({})
        runtime/asm_amd64.s:1700 +0x1 fp=0xc00013bfe8 sp=0xc00013bfe0 pc=0x55c065a9ef21

goroutine 2 gp=0xc000006c40 m=nil [force gc (idle)]:
runtime.gopark(0x0?, 0x0?, 0x0?, 0x0?, 0x0?)
        runtime/proc.go:424 +0xce fp=0xc00004cfa8 sp=0xc00004cf88 pc=0x55c065a972ee
runtime.goparkunlock(...)
        runtime/proc.go:430
runtime.forcegchelper()
        runtime/proc.go:337 +0xb8 fp=0xc00004cfe0 sp=0xc00004cfa8 pc=0x55c065a63978
runtime.goexit({})
        runtime/asm_amd64.s:1700 +0x1 fp=0xc00004cfe8 sp=0xc00004cfe0 pc=0x55c065a9ef21
created by runtime.init.7 in goroutine 1
        runtime/proc.go:325 +0x1a

goroutine 3 gp=0xc000007180 m=nil [GC sweep wait]:
runtime.gopark(0x1?, 0x0?, 0x0?, 0x0?, 0x0?)
        runtime/proc.go:424 +0xce fp=0xc00004d780 sp=0xc00004d760 pc=0x55c065a972ee
runtime.goparkunlock(...)
        runtime/proc.go:430
runtime.bgsweep(0xc000078000)
        runtime/mgcsweep.go:317 +0xdf fp=0xc00004d7c8 sp=0xc00004d780 pc=0x55c065a4e1ff
runtime.gcenable.gowrap1()
        runtime/mgc.go:204 +0x25 fp=0xc00004d7e0 sp=0xc00004d7c8 pc=0x55c065a42a65
runtime.goexit({})
        runtime/asm_amd64.s:1700 +0x1 fp=0xc00004d7e8 sp=0xc00004d7e0 pc=0x55c065a9ef21
created by runtime.gcenable in goroutine 1
        runtime/mgc.go:204 +0x66

goroutine 4 gp=0xc000007340 m=nil [GC scavenge wait]:
runtime.gopark(0x6fa01?, 0xfc623?, 0x0?, 0x0?, 0x0?)
        runtime/proc.go:424 +0xce fp=0xc00004df78 sp=0xc00004df58 pc=0x55c065a972ee
runtime.goparkunlock(...)
        runtime/proc.go:430
runtime.(*scavengerState).park(0x55c0662c3080)
        runtime/mgcscavenge.go:425 +0x49 fp=0xc00004dfa8 sp=0xc00004df78 pc=0x55c065a4bbe9
runtime.bgscavenge(0xc000078000)
        runtime/mgcscavenge.go:658 +0x59 fp=0xc00004dfc8 sp=0xc00004dfa8 pc=0x55c065a4c179
runtime.gcenable.gowrap2()
        runtime/mgc.go:205 +0x25 fp=0xc00004dfe0 sp=0xc00004dfc8 pc=0x55c065a42a05
runtime.goexit({})
        runtime/asm_amd64.s:1700 +0x1 fp=0xc00004dfe8 sp=0xc00004dfe0 pc=0x55c065a9ef21
created by runtime.gcenable in goroutine 1
        runtime/mgc.go:205 +0xa5

goroutine 5 gp=0xc000007c00 m=nil [finalizer wait]:
runtime.gopark(0x0?, 0x55c0660d30d8?, 0x30?, 0x0?, 0x1000000010?)
        runtime/proc.go:424 +0xce fp=0xc00004c620 sp=0xc00004c600 pc=0x55c065a972ee
runtime.runfinq()
        runtime/mfinal.go:193 +0x107 fp=0xc00004c7e0 sp=0xc00004c620 pc=0x55c065a41ae7
runtime.goexit({})
        runtime/asm_amd64.s:1700 +0x1 fp=0xc00004c7e8 sp=0xc00004c7e0 pc=0x55c065a9ef21
created by runtime.createfing in goroutine 1
        runtime/mfinal.go:163 +0x3d

goroutine 6 gp=0xc000007dc0 m=nil [chan receive]:
runtime.gopark(0xc00004e760?, 0x55c065b4d785?, 0x70?, 0xa2?, 0x55c0660dbda0?)
        runtime/proc.go:424 +0xce fp=0xc00004e718 sp=0xc00004e6f8 pc=0x55c065a972ee
runtime.chanrecv(0xc0000201c0, 0x0, 0x1)
        runtime/chan.go:639 +0x41c fp=0xc00004e790 sp=0xc00004e718 pc=0x55c065a3273c
runtime.chanrecv1(0x0?, 0x0?)
        runtime/chan.go:489 +0x12 fp=0xc00004e7b8 sp=0xc00004e790 pc=0x55c065a32312
runtime.unique_runtime_registerUniqueMapCleanup.func1(...)
        runtime/mgc.go:1781
runtime.unique_runtime_registerUniqueMapCleanup.gowrap1()
        runtime/mgc.go:1784 +0x2f fp=0xc00004e7e0 sp=0xc00004e7b8 pc=0x55c065a458cf
runtime.goexit({})
        runtime/asm_amd64.s:1700 +0x1 fp=0xc00004e7e8 sp=0xc00004e7e0 pc=0x55c065a9ef21
created by unique.runtime_registerUniqueMapCleanup in goroutine 1
        runtime/mgc.go:1779 +0x96

goroutine 24 gp=0xc00008cc40 m=nil [select]:
runtime.gopark(0xc000139a68?, 0x2?, 0xd?, 0x77?, 0xc000139834?)
        runtime/proc.go:424 +0xce fp=0xc0001396a0 sp=0xc000139680 pc=0x55c065a972ee
runtime.selectgo(0xc000139a68, 0xc000139830, 0xc0000b7100?, 0x0, 0x1?, 0x1)
        runtime/select.go:335 +0x7a5 fp=0xc0001397c8 sp=0xc0001396a0 pc=0x55c065a75545
github.com/ollama/ollama/llama/runner.(*Server).completion(0xc0000ac000, {0x55c0660d7c58, 0xc0001a20e0}, 0xc000134280)
        github.com/ollama/ollama/llama/runner/runner.go:687 +0xa86 fp=0xc000139ac0 sp=0xc0001397c8 pc=0x55c065cd9206
github.com/ollama/ollama/llama/runner.(*Server).completion-fm({0x55c0660d7c58?, 0xc0001a20e0?}, 0x55c065caa807?)
        <autogenerated>:1 +0x36 fp=0xc000139af0 sp=0xc000139ac0 pc=0x55c065cdc6b6
net/http.HandlerFunc.ServeHTTP(0xc0000c6000?, {0x55c0660d7c58?, 0xc0001a20e0?}, 0x0?)
        net/http/server.go:2220 +0x29 fp=0xc000139b18 sp=0xc000139af0 pc=0x55c065ca33c9
net/http.(*ServeMux).ServeHTTP(0x55c065a38f65?, {0x55c0660d7c58, 0xc0001a20e0}, 0xc000134280)
        net/http/server.go:2747 +0x1ca fp=0xc000139b68 sp=0xc000139b18 pc=0x55c065ca526a
net/http.serverHandler.ServeHTTP({0x55c0660d6d10?}, {0x55c0660d7c58?, 0xc0001a20e0?}, 0x6?)
        net/http/server.go:3210 +0x8e fp=0xc000139b98 sp=0xc000139b68 pc=0x55c065cac16e
net/http.(*conn).serve(0xc0000ac2d0, {0x55c0660d8088, 0xc0000a0300})
        net/http/server.go:2092 +0x5d0 fp=0xc000139fb8 sp=0xc000139b98 pc=0x55c065ca1ff0
net/http.(*Server).Serve.gowrap3()
        net/http/server.go:3360 +0x28 fp=0xc000139fe0 sp=0xc000139fb8 pc=0x55c065ca6908
runtime.goexit({})
        runtime/asm_amd64.s:1700 +0x1 fp=0xc000139fe8 sp=0xc000139fe0 pc=0x55c065a9ef21
created by net/http.(*Server).Serve in goroutine 1
        net/http/server.go:3360 +0x485

goroutine 57 gp=0xc0001a6000 m=nil [IO wait]:
runtime.gopark(0x55c065a3d445?, 0x0?, 0x0?, 0x87?, 0xb?)
        runtime/proc.go:424 +0xce fp=0xc0000485a8 sp=0xc000048588 pc=0x55c065a972ee
runtime.netpollblock(0x55c065ad2b18?, 0x65a2fb46?, 0xc0?)
        runtime/netpoll.go:575 +0xf7 fp=0xc0000485e0 sp=0xc0000485a8 pc=0x55c065a5c057
internal/poll.runtime_pollWait(0x7ff2d39c4ce8, 0x72)
        runtime/netpoll.go:351 +0x85 fp=0xc000048600 sp=0xc0000485e0 pc=0x55c065a965e5
internal/poll.(*pollDesc).wait(0xc0000a6200?, 0xc0000a06a1?, 0x0)
        internal/poll/fd_poll_runtime.go:84 +0x27 fp=0xc000048628 sp=0xc000048600 pc=0x55c065aec427
internal/poll.(*pollDesc).waitRead(...)
        internal/poll/fd_poll_runtime.go:89
internal/poll.(*FD).Read(0xc0000a6200, {0xc0000a06a1, 0x1, 0x1})
        internal/poll/fd_unix.go:165 +0x27a fp=0xc0000486c0 sp=0xc000048628 pc=0x55c065aecf7a
net.(*netFD).Read(0xc0000a6200, {0xc0000a06a1?, 0xc000048748?, 0x55c065a98990?})
        net/fd_posix.go:55 +0x25 fp=0xc000048708 sp=0xc0000486c0 pc=0x55c065b65185
net.(*conn).Read(0xc0000a8050, {0xc0000a06a1?, 0x0?, 0x55c066368320?})
        net/net.go:189 +0x45 fp=0xc000048750 sp=0xc000048708 pc=0x55c065b6eb85
net.(*TCPConn).Read(0x55c066283d90?, {0xc0000a06a1?, 0x0?, 0x0?})
        <autogenerated>:1 +0x25 fp=0xc000048780 sp=0xc000048750 pc=0x55c065b7bc25
net/http.(*connReader).backgroundRead(0xc0000a0690)
        net/http/server.go:690 +0x37 fp=0xc0000487c8 sp=0xc000048780 pc=0x55c065c9c977
net/http.(*connReader).startBackgroundRead.gowrap2()
        net/http/server.go:686 +0x25 fp=0xc0000487e0 sp=0xc0000487c8 pc=0x55c065c9c8a5
runtime.goexit({})
        runtime/asm_amd64.s:1700 +0x1 fp=0xc0000487e8 sp=0xc0000487e0 pc=0x55c065a9ef21
created by net/http.(*connReader).startBackgroundRead in goroutine 24
        net/http/server.go:686 +0xb6

goroutine 41 gp=0xc0002301c0 m=nil [GC worker (idle)]:
runtime.gopark(0x0?, 0x0?, 0x0?, 0x0?, 0x0?)
        runtime/proc.go:424 +0xce fp=0xc0001ae738 sp=0xc0001ae718 pc=0x55c065a972ee
runtime.gcBgMarkWorker(0xc00022a380)
        runtime/mgc.go:1412 +0xe9 fp=0xc0001ae7c8 sp=0xc0001ae738 pc=0x55c065a44bc9
runtime.gcBgMarkStartWorkers.gowrap1()
        runtime/mgc.go:1328 +0x25 fp=0xc0001ae7e0 sp=0xc0001ae7c8 pc=0x55c065a44aa5
runtime.goexit({})
        runtime/asm_amd64.s:1700 +0x1 fp=0xc0001ae7e8 sp=0xc0001ae7e0 pc=0x55c065a9ef21
created by runtime.gcBgMarkStartWorkers in goroutine 24
        runtime/mgc.go:1328 +0x105

goroutine 42 gp=0xc000230380 m=nil [GC worker (idle)]:
runtime.gopark(0x5207922bbd5d?, 0x1?, 0x36?, 0x1?, 0x0?)
        runtime/proc.go:424 +0xce fp=0xc0001aef38 sp=0xc0001aef18 pc=0x55c065a972ee
runtime.gcBgMarkWorker(0xc00022a380)
        runtime/mgc.go:1412 +0xe9 fp=0xc0001aefc8 sp=0xc0001aef38 pc=0x55c065a44bc9
runtime.gcBgMarkStartWorkers.gowrap1()
        runtime/mgc.go:1328 +0x25 fp=0xc0001aefe0 sp=0xc0001aefc8 pc=0x55c065a44aa5
runtime.goexit({})
        runtime/asm_amd64.s:1700 +0x1 fp=0xc0001aefe8 sp=0xc0001aefe0 pc=0x55c065a9ef21
created by runtime.gcBgMarkStartWorkers in goroutine 24
        runtime/mgc.go:1328 +0x105

goroutine 43 gp=0xc000230540 m=nil [GC worker (idle)]:
runtime.gopark(0x5207922c8bbc?, 0x0?, 0x0?, 0x0?, 0x0?)
        runtime/proc.go:424 +0xce fp=0xc0001af738 sp=0xc0001af718 pc=0x55c065a972ee
runtime.gcBgMarkWorker(0xc00022a380)
        runtime/mgc.go:1412 +0xe9 fp=0xc0001af7c8 sp=0xc0001af738 pc=0x55c065a44bc9
runtime.gcBgMarkStartWorkers.gowrap1()
        runtime/mgc.go:1328 +0x25 fp=0xc0001af7e0 sp=0xc0001af7c8 pc=0x55c065a44aa5
runtime.goexit({})
        runtime/asm_amd64.s:1700 +0x1 fp=0xc0001af7e8 sp=0xc0001af7e0 pc=0x55c065a9ef21
created by runtime.gcBgMarkStartWorkers in goroutine 24
        runtime/mgc.go:1328 +0x105

goroutine 44 gp=0xc000230700 m=nil [GC worker (idle)]:
runtime.gopark(0x5207922b5985?, 0x0?, 0x0?, 0x0?, 0x0?)
        runtime/proc.go:424 +0xce fp=0xc0001aff38 sp=0xc0001aff18 pc=0x55c065a972ee
runtime.gcBgMarkWorker(0xc00022a380)
        runtime/mgc.go:1412 +0xe9 fp=0xc0001affc8 sp=0xc0001aff38 pc=0x55c065a44bc9
runtime.gcBgMarkStartWorkers.gowrap1()
        runtime/mgc.go:1328 +0x25 fp=0xc0001affe0 sp=0xc0001affc8 pc=0x55c065a44aa5
runtime.goexit({})
        runtime/asm_amd64.s:1700 +0x1 fp=0xc0001affe8 sp=0xc0001affe0 pc=0x55c065a9ef21
created by runtime.gcBgMarkStartWorkers in goroutine 24
        runtime/mgc.go:1328 +0x105

rax    0x206803fcc
rbx    0x7ff25816e110
rcx    0xff3
rdx    0x7ff258006840
rdi    0x7ff258006850
rsi    0x0
rbp    0x7ff26cf49d50
rsp    0x7ff26cf49d30
r8     0x7ff25801f948
r9     0x0
r10    0x0
r11    0x246
r12    0x7ff26008e130
r13    0x7ff258006850
r14    0x0
r15    0x7ff32697de40
rip    0x7ff2dadefe57
rflags 0x10297
cs     0x33
fs     0x0
gs     0x0
SIGABRT: abort
PC=0x7ff2b560eb1c m=5 sigcode=18446744073709551610
signal arrived during cgo execution

goroutine 19 gp=0xc00008c540 m=5 mp=0xc000088008 [syscall]:
runtime.cgocall(0x55c065cdcfb0, 0xc000061ba0)
        runtime/cgocall.go:167 +0x4b fp=0xc000061b78 sp=0xc000061b40 pc=0x55c065a914eb
github.com/ollama/ollama/llama._Cfunc_llama_decode(0x7ff258c39300, {0x1, 0x7ff258d88d00, 0x0, 0x0, 0x7ff258b81890, 0x7ff258b820a0, 0x7ff258b18630, 0x7ff259ae3550})
        _cgo_gotypes.go:556 +0x4f fp=0xc000061ba0 sp=0xc000061b78 pc=0x55c065b3b56f
github.com/ollama/ollama/llama.(*Context).Decode.func1(0x55c065cd86eb?, 0x7ff258c39300?)
        github.com/ollama/ollama/llama/llama.go:207 +0xf5 fp=0xc000061c90 sp=0xc000061ba0 pc=0x55c065b3dd95
github.com/ollama/ollama/llama.(*Context).Decode(0x55c066368320?, 0x0?)
        github.com/ollama/ollama/llama/llama.go:207 +0x13 fp=0xc000061cd8 sp=0xc000061c90 pc=0x55c065b3dc13
github.com/ollama/ollama/llama/runner.(*Server).processBatch(0xc0000ac000, 0xc0001ac0c0, 0xc0001ac120)
        github.com/ollama/ollama/llama/runner/runner.go:434 +0x23f fp=0xc000061ee0 sp=0xc000061cd8 pc=0x55c065cd74df
github.com/ollama/ollama/llama/runner.(*Server).run(0xc0000ac000, {0x55c0660d80c0, 0xc0000aa050})
        github.com/ollama/ollama/llama/runner/runner.go:342 +0x1d5 fp=0xc000061fb8 sp=0xc000061ee0 pc=0x55c065cd6f15
github.com/ollama/ollama/llama/runner.Execute.gowrap2()
        github.com/ollama/ollama/llama/runner/runner.go:975 +0x28 fp=0xc000061fe0 sp=0xc000061fb8 pc=0x55c065cdbe08
runtime.goexit({})
        runtime/asm_amd64.s:1700 +0x1 fp=0xc000061fe8 sp=0xc000061fe0 pc=0x55c065a9ef21
created by github.com/ollama/ollama/llama/runner.Execute in goroutine 1
        github.com/ollama/ollama/llama/runner/runner.go:975 +0xde5

goroutine 1 gp=0xc0000061c0 m=nil [IO wait]:
runtime.gopark(0x0?, 0x0?, 0x0?, 0x0?, 0x0?)
        runtime/proc.go:424 +0xce fp=0xc00013b7b0 sp=0xc00013b790 pc=0x55c065a972ee
runtime.netpollblock(0xc000031800?, 0x65a2fb46?, 0xc0?)
        runtime/netpoll.go:575 +0xf7 fp=0xc00013b7e8 sp=0xc00013b7b0 pc=0x55c065a5c057
internal/poll.runtime_pollWait(0x7ff2d39c4e00, 0x72)
        runtime/netpoll.go:351 +0x85 fp=0xc00013b808 sp=0xc00013b7e8 pc=0x55c065a965e5
internal/poll.(*pollDesc).wait(0xc0000a6100?, 0x900000036?, 0x0)
        internal/poll/fd_poll_runtime.go:84 +0x27 fp=0xc00013b830 sp=0xc00013b808 pc=0x55c065aec427
internal/poll.(*pollDesc).waitRead(...)
        internal/poll/fd_poll_runtime.go:89
internal/poll.(*FD).Accept(0xc0000a6100)
        internal/poll/fd_unix.go:620 +0x295 fp=0xc00013b8d8 sp=0xc00013b830 pc=0x55c065aed995
net.(*netFD).accept(0xc0000a6100)
        net/fd_unix.go:172 +0x29 fp=0xc00013b990 sp=0xc00013b8d8 pc=0x55c065b66269
net.(*TCPListener).accept(0xc0000b0040)
        net/tcpsock_posix.go:159 +0x1e fp=0xc00013b9e0 sp=0xc00013b990 pc=0x55c065b768be
net.(*TCPListener).Accept(0xc0000b0040)
        net/tcpsock.go:372 +0x30 fp=0xc00013ba10 sp=0xc00013b9e0 pc=0x55c065b75bf0
net/http.(*onceCloseListener).Accept(0xc0000ac2d0?)
        <autogenerated>:1 +0x24 fp=0xc00013ba28 sp=0xc00013ba10 pc=0x55c065cb47c4
net/http.(*Server).Serve(0xc00009e3c0, {0x55c0660d7ad8, 0xc0000b0040})
        net/http/server.go:3330 +0x30c fp=0xc00013bb58 sp=0xc00013ba28 pc=0x55c065ca650c
github.com/ollama/ollama/llama/runner.Execute({0xc000016130?, 0x55c065a9eb7c?, 0x0?})
        github.com/ollama/ollama/llama/runner/runner.go:996 +0x11a9 fp=0xc00013bef8 sp=0xc00013bb58 pc=0x55c065cdbae9
main.main()
        github.com/ollama/ollama/cmd/runner/main.go:11 +0x54 fp=0xc00013bf50 sp=0xc00013bef8 pc=0x55c065cdca74
runtime.main()
        runtime/proc.go:272 +0x29d fp=0xc00013bfe0 sp=0xc00013bf50 pc=0x55c065a6363d
runtime.goexit({})
        runtime/asm_amd64.s:1700 +0x1 fp=0xc00013bfe8 sp=0xc00013bfe0 pc=0x55c065a9ef21

goroutine 2 gp=0xc000006c40 m=nil [force gc (idle)]:
runtime.gopark(0x0?, 0x0?, 0x0?, 0x0?, 0x0?)
        runtime/proc.go:424 +0xce fp=0xc00004cfa8 sp=0xc00004cf88 pc=0x55c065a972ee
runtime.goparkunlock(...)
        runtime/proc.go:430
runtime.forcegchelper()
        runtime/proc.go:337 +0xb8 fp=0xc00004cfe0 sp=0xc00004cfa8 pc=0x55c065a63978
runtime.goexit({})
        runtime/asm_amd64.s:1700 +0x1 fp=0xc00004cfe8 sp=0xc00004cfe0 pc=0x55c065a9ef21
created by runtime.init.7 in goroutine 1
        runtime/proc.go:325 +0x1a

goroutine 3 gp=0xc000007180 m=nil [GC sweep wait]:
runtime.gopark(0x1?, 0x0?, 0x0?, 0x0?, 0x0?)
        runtime/proc.go:424 +0xce fp=0xc00004d780 sp=0xc00004d760 pc=0x55c065a972ee
runtime.goparkunlock(...)
        runtime/proc.go:430
runtime.bgsweep(0xc000078000)
        runtime/mgcsweep.go:317 +0xdf fp=0xc00004d7c8 sp=0xc00004d780 pc=0x55c065a4e1ff
runtime.gcenable.gowrap1()
        runtime/mgc.go:204 +0x25 fp=0xc00004d7e0 sp=0xc00004d7c8 pc=0x55c065a42a65
runtime.goexit({})
        runtime/asm_amd64.s:1700 +0x1 fp=0xc00004d7e8 sp=0xc00004d7e0 pc=0x55c065a9ef21
created by runtime.gcenable in goroutine 1
        runtime/mgc.go:204 +0x66

goroutine 4 gp=0xc000007340 m=nil [GC scavenge wait]:
runtime.gopark(0x6fa01?, 0xfc623?, 0x0?, 0x0?, 0x0?)
        runtime/proc.go:424 +0xce fp=0xc00004df78 sp=0xc00004df58 pc=0x55c065a972ee
runtime.goparkunlock(...)
        runtime/proc.go:430
runtime.(*scavengerState).park(0x55c0662c3080)
        runtime/mgcscavenge.go:425 +0x49 fp=0xc00004dfa8 sp=0xc00004df78 pc=0x55c065a4bbe9
runtime.bgscavenge(0xc000078000)
        runtime/mgcscavenge.go:658 +0x59 fp=0xc00004dfc8 sp=0xc00004dfa8 pc=0x55c065a4c179
runtime.gcenable.gowrap2()
        runtime/mgc.go:205 +0x25 fp=0xc00004dfe0 sp=0xc00004dfc8 pc=0x55c065a42a05
runtime.goexit({})
        runtime/asm_amd64.s:1700 +0x1 fp=0xc00004dfe8 sp=0xc00004dfe0 pc=0x55c065a9ef21
created by runtime.gcenable in goroutine 1
        runtime/mgc.go:205 +0xa5

goroutine 5 gp=0xc000007c00 m=nil [finalizer wait]:
runtime.gopark(0x0?, 0x55c0660d30d8?, 0x30?, 0x0?, 0x1000000010?)
        runtime/proc.go:424 +0xce fp=0xc00004c620 sp=0xc00004c600 pc=0x55c065a972ee
runtime.runfinq()
        runtime/mfinal.go:193 +0x107 fp=0xc00004c7e0 sp=0xc00004c620 pc=0x55c065a41ae7
runtime.goexit({})
        runtime/asm_amd64.s:1700 +0x1 fp=0xc00004c7e8 sp=0xc00004c7e0 pc=0x55c065a9ef21
created by runtime.createfing in goroutine 1
        runtime/mfinal.go:163 +0x3d

goroutine 6 gp=0xc000007dc0 m=nil [chan receive]:
runtime.gopark(0xc00004e760?, 0x55c065b4d785?, 0x70?, 0xa2?, 0x55c0660dbda0?)
        runtime/proc.go:424 +0xce fp=0xc00004e718 sp=0xc00004e6f8 pc=0x55c065a972ee
runtime.chanrecv(0xc0000201c0, 0x0, 0x1)
        runtime/chan.go:639 +0x41c fp=0xc00004e790 sp=0xc00004e718 pc=0x55c065a3273c
runtime.chanrecv1(0x0?, 0x0?)
        runtime/chan.go:489 +0x12 fp=0xc00004e7b8 sp=0xc00004e790 pc=0x55c065a32312
runtime.unique_runtime_registerUniqueMapCleanup.func1(...)
        runtime/mgc.go:1781
runtime.unique_runtime_registerUniqueMapCleanup.gowrap1()
        runtime/mgc.go:1784 +0x2f fp=0xc00004e7e0 sp=0xc00004e7b8 pc=0x55c065a458cf
runtime.goexit({})
        runtime/asm_amd64.s:1700 +0x1 fp=0xc00004e7e8 sp=0xc00004e7e0 pc=0x55c065a9ef21
created by unique.runtime_registerUniqueMapCleanup in goroutine 1
        runtime/mgc.go:1779 +0x96

goroutine 24 gp=0xc00008cc40 m=nil [select]:
runtime.gopark(0xc000139a68?, 0x2?, 0xd?, 0x77?, 0xc000139834?)
        runtime/proc.go:424 +0xce fp=0xc0001396a0 sp=0xc000139680 pc=0x55c065a972ee
runtime.selectgo(0xc000139a68, 0xc000139830, 0xc0000b7100?, 0x0, 0x1?, 0x1)
        runtime/select.go:335 +0x7a5 fp=0xc0001397c8 sp=0xc0001396a0 pc=0x55c065a75545
github.com/ollama/ollama/llama/runner.(*Server).completion(0xc0000ac000, {0x55c0660d7c58, 0xc0001a20e0}, 0xc000134280)
        github.com/ollama/ollama/llama/runner/runner.go:687 +0xa86 fp=0xc000139ac0 sp=0xc0001397c8 pc=0x55c065cd9206
github.com/ollama/ollama/llama/runner.(*Server).completion-fm({0x55c0660d7c58?, 0xc0001a20e0?}, 0x55c065caa807?)
        <autogenerated>:1 +0x36 fp=0xc000139af0 sp=0xc000139ac0 pc=0x55c065cdc6b6
net/http.HandlerFunc.ServeHTTP(0xc0000c6000?, {0x55c0660d7c58?, 0xc0001a20e0?}, 0x0?)
        net/http/server.go:2220 +0x29 fp=0xc000139b18 sp=0xc000139af0 pc=0x55c065ca33c9
net/http.(*ServeMux).ServeHTTP(0x55c065a38f65?, {0x55c0660d7c58, 0xc0001a20e0}, 0xc000134280)
        net/http/server.go:2747 +0x1ca fp=0xc000139b68 sp=0xc000139b18 pc=0x55c065ca526a
net/http.serverHandler.ServeHTTP({0x55c0660d6d10?}, {0x55c0660d7c58?, 0xc0001a20e0?}, 0x6?)
        net/http/server.go:3210 +0x8e fp=0xc000139b98 sp=0xc000139b68 pc=0x55c065cac16e
net/http.(*conn).serve(0xc0000ac2d0, {0x55c0660d8088, 0xc0000a0300})
        net/http/server.go:2092 +0x5d0 fp=0xc000139fb8 sp=0xc000139b98 pc=0x55c065ca1ff0
net/http.(*Server).Serve.gowrap3()
        net/http/server.go:3360 +0x28 fp=0xc000139fe0 sp=0xc000139fb8 pc=0x55c065ca6908
runtime.goexit({})
        runtime/asm_amd64.s:1700 +0x1 fp=0xc000139fe8 sp=0xc000139fe0 pc=0x55c065a9ef21
created by net/http.(*Server).Serve in goroutine 1
        net/http/server.go:3360 +0x485

goroutine 57 gp=0xc0001a6000 m=nil [IO wait]:
runtime.gopark(0x55c065a3d445?, 0x0?, 0x0?, 0x87?, 0xb?)
        runtime/proc.go:424 +0xce fp=0xc0000485a8 sp=0xc000048588 pc=0x55c065a972ee
runtime.netpollblock(0x55c065ad2b18?, 0x65a2fb46?, 0xc0?)
        runtime/netpoll.go:575 +0xf7 fp=0xc0000485e0 sp=0xc0000485a8 pc=0x55c065a5c057
internal/poll.runtime_pollWait(0x7ff2d39c4ce8, 0x72)
        runtime/netpoll.go:351 +0x85 fp=0xc000048600 sp=0xc0000485e0 pc=0x55c065a965e5
internal/poll.(*pollDesc).wait(0xc0000a6200?, 0xc0000a06a1?, 0x0)
        internal/poll/fd_poll_runtime.go:84 +0x27 fp=0xc000048628 sp=0xc000048600 pc=0x55c065aec427
internal/poll.(*pollDesc).waitRead(...)
        internal/poll/fd_poll_runtime.go:89
internal/poll.(*FD).Read(0xc0000a6200, {0xc0000a06a1, 0x1, 0x1})
        internal/poll/fd_unix.go:165 +0x27a fp=0xc0000486c0 sp=0xc000048628 pc=0x55c065aecf7a
net.(*netFD).Read(0xc0000a6200, {0xc0000a06a1?, 0xc000048748?, 0x55c065a98990?})
        net/fd_posix.go:55 +0x25 fp=0xc000048708 sp=0xc0000486c0 pc=0x55c065b65185
net.(*conn).Read(0xc0000a8050, {0xc0000a06a1?, 0x0?, 0x55c066368320?})
        net/net.go:189 +0x45 fp=0xc000048750 sp=0xc000048708 pc=0x55c065b6eb85
net.(*TCPConn).Read(0x55c066283d90?, {0xc0000a06a1?, 0x0?, 0x0?})
        <autogenerated>:1 +0x25 fp=0xc000048780 sp=0xc000048750 pc=0x55c065b7bc25
net/http.(*connReader).backgroundRead(0xc0000a0690)
        net/http/server.go:690 +0x37 fp=0xc0000487c8 sp=0xc000048780 pc=0x55c065c9c977
net/http.(*connReader).startBackgroundRead.gowrap2()
        net/http/server.go:686 +0x25 fp=0xc0000487e0 sp=0xc0000487c8 pc=0x55c065c9c8a5
runtime.goexit({})
        runtime/asm_amd64.s:1700 +0x1 fp=0xc0000487e8 sp=0xc0000487e0 pc=0x55c065a9ef21
created by net/http.(*connReader).startBackgroundRead in goroutine 24
        net/http/server.go:686 +0xb6

goroutine 41 gp=0xc0002301c0 m=nil [GC worker (idle)]:
runtime.gopark(0x0?, 0x0?, 0x0?, 0x0?, 0x0?)
        runtime/proc.go:424 +0xce fp=0xc0001ae738 sp=0xc0001ae718 pc=0x55c065a972ee
runtime.gcBgMarkWorker(0xc00022a380)
        runtime/mgc.go:1412 +0xe9 fp=0xc0001ae7c8 sp=0xc0001ae738 pc=0x55c065a44bc9
runtime.gcBgMarkStartWorkers.gowrap1()
        runtime/mgc.go:1328 +0x25 fp=0xc0001ae7e0 sp=0xc0001ae7c8 pc=0x55c065a44aa5
runtime.goexit({})
        runtime/asm_amd64.s:1700 +0x1 fp=0xc0001ae7e8 sp=0xc0001ae7e0 pc=0x55c065a9ef21
created by runtime.gcBgMarkStartWorkers in goroutine 24
        runtime/mgc.go:1328 +0x105

goroutine 42 gp=0xc000230380 m=nil [GC worker (idle)]:
runtime.gopark(0x5207922bbd5d?, 0x1?, 0x36?, 0x1?, 0x0?)
        runtime/proc.go:424 +0xce fp=0xc0001aef38 sp=0xc0001aef18 pc=0x55c065a972ee
runtime.gcBgMarkWorker(0xc00022a380)
        runtime/mgc.go:1412 +0xe9 fp=0xc0001aefc8 sp=0xc0001aef38 pc=0x55c065a44bc9
runtime.gcBgMarkStartWorkers.gowrap1()
        runtime/mgc.go:1328 +0x25 fp=0xc0001aefe0 sp=0xc0001aefc8 pc=0x55c065a44aa5
runtime.goexit({})
        runtime/asm_amd64.s:1700 +0x1 fp=0xc0001aefe8 sp=0xc0001aefe0 pc=0x55c065a9ef21
created by runtime.gcBgMarkStartWorkers in goroutine 24
        runtime/mgc.go:1328 +0x105

goroutine 43 gp=0xc000230540 m=nil [GC worker (idle)]:
runtime.gopark(0x5207922c8bbc?, 0x0?, 0x0?, 0x0?, 0x0?)
        runtime/proc.go:424 +0xce fp=0xc0001af738 sp=0xc0001af718 pc=0x55c065a972ee
runtime.gcBgMarkWorker(0xc00022a380)
        runtime/mgc.go:1412 +0xe9 fp=0xc0001af7c8 sp=0xc0001af738 pc=0x55c065a44bc9
runtime.gcBgMarkStartWorkers.gowrap1()
        runtime/mgc.go:1328 +0x25 fp=0xc0001af7e0 sp=0xc0001af7c8 pc=0x55c065a44aa5
runtime.goexit({})
        runtime/asm_amd64.s:1700 +0x1 fp=0xc0001af7e8 sp=0xc0001af7e0 pc=0x55c065a9ef21
created by runtime.gcBgMarkStartWorkers in goroutine 24
        runtime/mgc.go:1328 +0x105

goroutine 44 gp=0xc000230700 m=nil [GC worker (idle)]:
runtime.gopark(0x5207922b5985?, 0x0?, 0x0?, 0x0?, 0x0?)
        runtime/proc.go:424 +0xce fp=0xc0001aff38 sp=0xc0001aff18 pc=0x55c065a972ee
runtime.gcBgMarkWorker(0xc00022a380)
        runtime/mgc.go:1412 +0xe9 fp=0xc0001affc8 sp=0xc0001aff38 pc=0x55c065a44bc9
runtime.gcBgMarkStartWorkers.gowrap1()
        runtime/mgc.go:1328 +0x25 fp=0xc0001affe0 sp=0xc0001affc8 pc=0x55c065a44aa5
runtime.goexit({})
        runtime/asm_amd64.s:1700 +0x1 fp=0xc0001affe8 sp=0xc0001affe0 pc=0x55c065a9ef21
created by runtime.gcBgMarkStartWorkers in goroutine 24
        runtime/mgc.go:1328 +0x105

rax    0x0
rbx    0xc1
rcx    0x7ff2b560eb1c
rdx    0x6
rdi    0xbd
rsi    0xc1
rbp    0x7ff26cf49eb0
rsp    0x7ff26cf49e70
r8     0x0
r9     0x0
r10    0x8
r11    0x246
r12    0x6
r13    0x640
r14    0x16
r15    0x4000000
rip    0x7ff2b560eb1c
rflags 0x246
cs     0x33
fs     0x0
gs     0x0
<!-- gh-comment-id:2654140978 --> @anton-b commented on GitHub (Feb 12, 2025): Having same issue: ``` ggml_cuda_init: found 1 CUDA devices: Device 0: NVIDIA L40S, compute capability 8.9, VMM: yes time=2025-02-12T15:42:39.343Z level=INFO source=runner.go:937 msg=system info="CUDA : ARCHS = 600,610,620,700,720,750,800,860,870,890,900 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | LLAMAFILE = 1 | AARCH64_REPACK = 1 | cgo(gcc)" threads=2 time=2025-02-12T15:42:39.343Z level=INFO source=.:0 msg="Server listening on 127.0.0.1:38761" llama_load_model_from_file: using device CUDA0 (NVIDIA L40S) - 45055 MiB free llama_model_loader: loaded meta data with 27 key-value pairs and 396 tensors from /home/gpu_runtime/.ollama/models/blobs/sha256-7ef0839fb71fbab13fda97c1b9819ffd99c799ba4f93d421ae1e2a46d68c5fa6 (version GGUF V3 (latest)) llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. llama_model_loader: - kv 0: general.architecture str = mllama llama_model_loader: - kv 1: general.type str = model llama_model_loader: - kv 2: general.name str = Model llama_model_loader: - kv 3: general.size_label str = 10B llama_model_loader: - kv 4: mllama.block_count u32 = 40 llama_model_loader: - kv 5: mllama.context_length u32 = 131072 llama_model_loader: - kv 6: mllama.embedding_length u32 = 4096 llama_model_loader: - kv 7: mllama.feed_forward_length u32 = 14336 llama_model_loader: - kv 8: mllama.attention.head_count u32 = 32 llama_model_loader: - kv 9: mllama.attention.head_count_kv u32 = 8 llama_model_loader: - kv 10: mllama.rope.freq_base f32 = 500000.000000 llama_model_loader: - kv 11: mllama.attention.layer_norm_rms_epsilon f32 = 0.000010 llama_model_loader: - kv 12: general.file_type u32 = 7 llama_model_loader: - kv 13: mllama.vocab_size u32 = 128256 llama_model_loader: - kv 14: mllama.rope.dimension_count u32 = 128 llama_model_loader: - kv 15: mllama.attention.cross_attention_layers arr[i32,8] = [3, 8, 
13, 18, 23, 28, 33, 38] llama_model_loader: - kv 16: tokenizer.ggml.add_bos_token bool = true llama_model_loader: - kv 17: tokenizer.ggml.model str = gpt2 llama_model_loader: - kv 18: tokenizer.ggml.pre str = llama-bpe llama_model_loader: - kv 19: tokenizer.ggml.tokens arr[str,128257] = ["!", "\"", "#", "$", "%", "&", "'", ... llama_model_loader: - kv 20: tokenizer.ggml.token_type arr[i32,128257] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... time=2025-02-12T15:42:39.547Z level=INFO source=server.go:589 msg="waiting for server to become available" status="llm server loading model" llama_model_loader: - kv 21: tokenizer.ggml.merges arr[str,280147] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... llama_model_loader: - kv 22: tokenizer.ggml.bos_token_id u32 = 128000 llama_model_loader: - kv 23: tokenizer.ggml.eos_token_id u32 = 128009 llama_model_loader: - kv 24: tokenizer.ggml.padding_token_id u32 = 128004 llama_model_loader: - kv 25: tokenizer.chat_template str = {{- bos_token }}\n{%- if custom_tools ... llama_model_loader: - kv 26: general.quantization_version u32 = 2 llama_model_loader: - type f32: 114 tensors llama_model_loader: - type q8_0: 282 tensors llm_load_vocab: special tokens cache size = 257 llm_load_vocab: token to piece cache size = 0.7999 MB llm_load_print_meta: format = GGUF V3 (latest) llm_load_print_meta: arch = mllama llm_load_print_meta: vocab type = BPE llm_load_print_meta: n_vocab = 128256 llm_load_print_meta: n_merges = 280147 llm_load_print_meta: vocab_only = 0 llm_load_print_meta: n_ctx_train = 131072 llm_load_print_meta: n_embd = 4096 llm_load_print_meta: n_layer = 40 llm_load_print_meta: n_head = 32 llm_load_print_meta: n_head_kv = 8 llm_load_print_meta: n_rot = 128 llm_load_print_meta: n_swa = 0 llm_load_print_meta: n_embd_head_k = 128 llm_load_print_meta: n_embd_head_v = 128 llm_load_print_meta: n_gqa = 4 llm_load_print_meta: n_embd_k_gqa = 1024 llm_load_print_meta: n_embd_v_gqa = 1024 llm_load_print_meta: f_norm_eps = 0.0e+00 llm_load_print_meta: 
f_norm_rms_eps = 1.0e-05 llm_load_print_meta: f_clamp_kqv = 0.0e+00 llm_load_print_meta: f_max_alibi_bias = 0.0e+00 llm_load_print_meta: f_logit_scale = 0.0e+00 llm_load_print_meta: n_ff = 14336 llm_load_print_meta: n_expert = 0 llm_load_print_meta: n_expert_used = 0 llm_load_print_meta: causal attn = 1 llm_load_print_meta: pooling type = 0 llm_load_print_meta: rope type = 0 llm_load_print_meta: rope scaling = linear llm_load_print_meta: freq_base_train = 500000.0 llm_load_print_meta: freq_scale_train = 1 llm_load_print_meta: n_ctx_orig_yarn = 131072 llm_load_print_meta: rope_finetuned = unknown llm_load_print_meta: ssm_d_conv = 0 llm_load_print_meta: ssm_d_inner = 0 llm_load_print_meta: ssm_d_state = 0 llm_load_print_meta: ssm_dt_rank = 0 llm_load_print_meta: ssm_dt_b_c_rms = 0 llm_load_print_meta: model type = 11B llm_load_print_meta: model ftype = Q8_0 llm_load_print_meta: model params = 9.78 B llm_load_print_meta: model size = 9.67 GiB (8.50 BPW) llm_load_print_meta: general.name = Model llm_load_print_meta: BOS token = 128000 '<|begin_of_text|>' llm_load_print_meta: EOS token = 128009 '<|eot_id|>' llm_load_print_meta: EOT token = 128009 '<|eot_id|>' llm_load_print_meta: EOM token = 128008 '<|eom_id|>' llm_load_print_meta: PAD token = 128004 '<|finetune_right_pad_id|>' llm_load_print_meta: LF token = 128 'Ä' llm_load_print_meta: EOG token = 128008 '<|eom_id|>' llm_load_print_meta: EOG token = 128009 '<|eot_id|>' llm_load_print_meta: max token length = 256 llama_model_load: vocab mismatch 128256 !- 128257 ... 
[GIN] 2025/02/12 - 15:42:40 | 200 | 30.3µs | 127.0.0.1 | HEAD "/" [GIN] 2025/02/12 - 15:42:40 | 200 | 33.351µs | 127.0.0.1 | GET "/api/ps" llm_load_tensors: offloading 40 repeating layers to GPU llm_load_tensors: offloading output layer to GPU llm_load_tensors: offloaded 41/41 layers to GPU llm_load_tensors: CPU_Mapped model buffer size = 532.35 MiB llm_load_tensors: CUDA0 model buffer size = 9373.59 MiB llama_new_context_with_model: n_seq_max = 1 llama_new_context_with_model: n_ctx = 16384 llama_new_context_with_model: n_ctx_per_seq = 16384 llama_new_context_with_model: n_batch = 512 llama_new_context_with_model: n_ubatch = 512 llama_new_context_with_model: flash_attn = 0 llama_new_context_with_model: freq_base = 500000.0 llama_new_context_with_model: freq_scale = 1 llama_new_context_with_model: n_ctx_per_seq (16384) < n_ctx_train (131072) -- the full capacity of the model will not be utilized llama_kv_cache_init: kv_size = 16384, offload = 1, type_k = 'f16', type_v = 'f16', n_layer = 40, can_shift = 1 llama_kv_cache_init: CUDA0 KV buffer size = 2448.25 MiB llama_new_context_with_model: KV self size = 2448.25 MiB, K (f16): 1224.12 MiB, V (f16): 1224.12 MiB llama_new_context_with_model: CUDA_Host output buffer size = 0.50 MiB llama_new_context_with_model: CUDA0 compute buffer size = 1088.00 MiB llama_new_context_with_model: CUDA_Host compute buffer size = 40.01 MiB llama_new_context_with_model: graph nodes = 1262 llama_new_context_with_model: graph splits = 2 mllama_model_load: model name: Llama-3.2-11B-Vision-Instruct mllama_model_load: description: vision encoder for Mllama mllama_model_load: GGUF version: 3 mllama_model_load: alignment: 32 mllama_model_load: n_tensors: 512 mllama_model_load: n_kv: 17 mllama_model_load: ftype: f16 mllama_model_load: mllama_model_load: vision using CUDA backend [GIN] 2025/02/12 - 15:42:41 | 200 | 30.161µs | 127.0.0.1 | HEAD "/" [GIN] 2025/02/12 - 15:42:41 | 200 | 40.801µs | 127.0.0.1 | GET "/api/ps" mllama_model_load: compute 
allocated memory: 2853.34 MB time=2025-02-12T15:42:42.306Z level=INFO source=server.go:594 msg="llama runner started in 3.01 seconds" llama_model_loader: loaded meta data with 27 key-value pairs and 396 tensors from /home/gpu_runtime/.ollama/models/blobs/sha256-7ef0839fb71fbab13fda97c1b9819ffd99c799ba4f93d421ae1e2a46d68c5fa6 (version GGUF V3 (latest)) llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. llama_model_loader: - kv 0: general.architecture str = mllama llama_model_loader: - kv 1: general.type str = model llama_model_loader: - kv 2: general.name str = Model llama_model_loader: - kv 3: general.size_label str = 10B llama_model_loader: - kv 4: mllama.block_count u32 = 40 llama_model_loader: - kv 5: mllama.context_length u32 = 131072 llama_model_loader: - kv 6: mllama.embedding_length u32 = 4096 llama_model_loader: - kv 7: mllama.feed_forward_length u32 = 14336 llama_model_loader: - kv 8: mllama.attention.head_count u32 = 32 llama_model_loader: - kv 9: mllama.attention.head_count_kv u32 = 8 llama_model_loader: - kv 10: mllama.rope.freq_base f32 = 500000.000000 llama_model_loader: - kv 11: mllama.attention.layer_norm_rms_epsilon f32 = 0.000010 llama_model_loader: - kv 12: general.file_type u32 = 7 llama_model_loader: - kv 13: mllama.vocab_size u32 = 128256 llama_model_loader: - kv 14: mllama.rope.dimension_count u32 = 128 llama_model_loader: - kv 15: mllama.attention.cross_attention_layers arr[i32,8] = [3, 8, 13, 18, 23, 28, 33, 38] llama_model_loader: - kv 16: tokenizer.ggml.add_bos_token bool = true llama_model_loader: - kv 17: tokenizer.ggml.model str = gpt2 llama_model_loader: - kv 18: tokenizer.ggml.pre str = llama-bpe llama_model_loader: - kv 19: tokenizer.ggml.tokens arr[str,128257] = ["!", "\"", "#", "$", "%", "&", "'", ... llama_model_loader: - kv 20: tokenizer.ggml.token_type arr[i32,128257] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... 
llama_model_loader: - kv 21: tokenizer.ggml.merges arr[str,280147] = ["Ġ Ġ", "Ġ ĠĠĠ", "��Ġ ĠĠ", "... llama_model_loader: - kv 22: tokenizer.ggml.bos_token_id u32 = 128000 llama_model_loader: - kv 23: tokenizer.ggml.eos_token_id u32 = 128009 llama_model_loader: - kv 24: tokenizer.ggml.padding_token_id u32 = 128004 llama_model_loader: - kv 25: tokenizer.chat_template str = {{- bos_token }}\n{%- if custom_tools ... llama_model_loader: - kv 26: general.quantization_version u32 = 2 llama_model_loader: - type f32: 114 tensors llama_model_loader: - type q8_0: 282 tensors llm_load_vocab: special tokens cache size = 257 llm_load_vocab: token to piece cache size = 0.7999 MB llm_load_print_meta: format = GGUF V3 (latest) llm_load_print_meta: arch = mllama llm_load_print_meta: vocab type = BPE llm_load_print_meta: n_vocab = 128256 llm_load_print_meta: n_merges = 280147 llm_load_print_meta: vocab_only = 1 llm_load_print_meta: model type = ?B llm_load_print_meta: model ftype = all F32 llm_load_print_meta: model params = 9.78 B llm_load_print_meta: model size = 9.67 GiB (8.50 BPW) llm_load_print_meta: general.name = Model llm_load_print_meta: BOS token = 128000 '<|begin_of_text|>' llm_load_print_meta: EOS token = 128009 '<|eot_id|>' llm_load_print_meta: EOT token = 128009 '<|eot_id|>' llm_load_print_meta: EOM token = 128008 '<|eom_id|>' llm_load_print_meta: PAD token = 128004 '<|finetune_right_pad_id|>' llm_load_print_meta: LF token = 128 'Ä' llm_load_print_meta: EOG token = 128008 '<|eom_id|>' llm_load_print_meta: EOG token = 128009 '<|eot_id|>' llm_load_print_meta: max token length = 256 llama_model_load: vocab mismatch 128256 !- 128257 ... 
llama_model_load: vocab only - skipping tensors [GIN] 2025/02/12 - 15:42:49 | 200 | 16.104305385s | 192.168.114.106 | POST "/api/chat" time=2025-02-12T15:42:50.097Z level=WARN source=sched.go:137 msg="mllama doesn't support parallel requests yet" ggml.c:1600: GGML_ASSERT(view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src)) failed SIGSEGV: segmentation violation PC=0x7ff2dadefe57 m=5 sigcode=1 addr=0x206803fcc signal arrived during cgo execution goroutine 19 gp=0xc00008c540 m=5 mp=0xc000088008 [syscall]: runtime.cgocall(0x55c065cdcfb0, 0xc000061ba0) runtime/cgocall.go:167 +0x4b fp=0xc000061b78 sp=0xc000061b40 pc=0x55c065a914eb github.com/ollama/ollama/llama._Cfunc_llama_decode(0x7ff258c39300, {0x1, 0x7ff258d88d00, 0x0, 0x0, 0x7ff258b81890, 0x7ff258b820a0, 0x7ff258b18630, 0x7ff259ae3550}) _cgo_gotypes.go:556 +0x4f fp=0xc000061ba0 sp=0xc000061b78 pc=0x55c065b3b56f github.com/ollama/ollama/llama.(*Context).Decode.func1(0x55c065cd86eb?, 0x7ff258c39300?) github.com/ollama/ollama/llama/llama.go:207 +0xf5 fp=0xc000061c90 sp=0xc000061ba0 pc=0x55c065b3dd95 github.com/ollama/ollama/llama.(*Context).Decode(0x55c066368320?, 0x0?) 
github.com/ollama/ollama/llama/llama.go:207 +0x13 fp=0xc000061cd8 sp=0xc000061c90 pc=0x55c065b3dc13 github.com/ollama/ollama/llama/runner.(*Server).processBatch(0xc0000ac000, 0xc0001ac0c0, 0xc0001ac120) github.com/ollama/ollama/llama/runner/runner.go:434 +0x23f fp=0xc000061ee0 sp=0xc000061cd8 pc=0x55c065cd74df github.com/ollama/ollama/llama/runner.(*Server).run(0xc0000ac000, {0x55c0660d80c0, 0xc0000aa050}) github.com/ollama/ollama/llama/runner/runner.go:342 +0x1d5 fp=0xc000061fb8 sp=0xc000061ee0 pc=0x55c065cd6f15 github.com/ollama/ollama/llama/runner.Execute.gowrap2() github.com/ollama/ollama/llama/runner/runner.go:975 +0x28 fp=0xc000061fe0 sp=0xc000061fb8 pc=0x55c065cdbe08 runtime.goexit({}) runtime/asm_amd64.s:1700 +0x1 fp=0xc000061fe8 sp=0xc000061fe0 pc=0x55c065a9ef21 created by github.com/ollama/ollama/llama/runner.Execute in goroutine 1 github.com/ollama/ollama/llama/runner/runner.go:975 +0xde5 goroutine 1 gp=0xc0000061c0 m=nil [IO wait]: runtime.gopark(0x0?, 0x0?, 0x0?, 0x0?, 0x0?) runtime/proc.go:424 +0xce fp=0xc00013b7b0 sp=0xc00013b790 pc=0x55c065a972ee runtime.netpollblock(0xc000031800?, 0x65a2fb46?, 0xc0?) runtime/netpoll.go:575 +0xf7 fp=0xc00013b7e8 sp=0xc00013b7b0 pc=0x55c065a5c057 internal/poll.runtime_pollWait(0x7ff2d39c4e00, 0x72) runtime/netpoll.go:351 +0x85 fp=0xc00013b808 sp=0xc00013b7e8 pc=0x55c065a965e5 internal/poll.(*pollDesc).wait(0xc0000a6100?, 0x900000036?, 0x0) internal/poll/fd_poll_runtime.go:84 +0x27 fp=0xc00013b830 sp=0xc00013b808 pc=0x55c065aec427 internal/poll.(*pollDesc).waitRead(...) 
internal/poll/fd_poll_runtime.go:89 internal/poll.(*FD).Accept(0xc0000a6100) internal/poll/fd_unix.go:620 +0x295 fp=0xc00013b8d8 sp=0xc00013b830 pc=0x55c065aed995 net.(*netFD).accept(0xc0000a6100) net/fd_unix.go:172 +0x29 fp=0xc00013b990 sp=0xc00013b8d8 pc=0x55c065b66269 net.(*TCPListener).accept(0xc0000b0040) net/tcpsock_posix.go:159 +0x1e fp=0xc00013b9e0 sp=0xc00013b990 pc=0x55c065b768be net.(*TCPListener).Accept(0xc0000b0040) net/tcpsock.go:372 +0x30 fp=0xc00013ba10 sp=0xc00013b9e0 pc=0x55c065b75bf0 net/http.(*onceCloseListener).Accept(0xc0000ac2d0?) <autogenerated>:1 +0x24 fp=0xc00013ba28 sp=0xc00013ba10 pc=0x55c065cb47c4 net/http.(*Server).Serve(0xc00009e3c0, {0x55c0660d7ad8, 0xc0000b0040}) net/http/server.go:3330 +0x30c fp=0xc00013bb58 sp=0xc00013ba28 pc=0x55c065ca650c github.com/ollama/ollama/llama/runner.Execute({0xc000016130?, 0x55c065a9eb7c?, 0x0?}) github.com/ollama/ollama/llama/runner/runner.go:996 +0x11a9 fp=0xc00013bef8 sp=0xc00013bb58 pc=0x55c065cdbae9 main.main() github.com/ollama/ollama/cmd/runner/main.go:11 +0x54 fp=0xc00013bf50 sp=0xc00013bef8 pc=0x55c065cdca74 runtime.main() runtime/proc.go:272 +0x29d fp=0xc00013bfe0 sp=0xc00013bf50 pc=0x55c065a6363d runtime.goexit({}) runtime/asm_amd64.s:1700 +0x1 fp=0xc00013bfe8 sp=0xc00013bfe0 pc=0x55c065a9ef21 goroutine 2 gp=0xc000006c40 m=nil [force gc (idle)]: runtime.gopark(0x0?, 0x0?, 0x0?, 0x0?, 0x0?) runtime/proc.go:424 +0xce fp=0xc00004cfa8 sp=0xc00004cf88 pc=0x55c065a972ee runtime.goparkunlock(...) runtime/proc.go:430 runtime.forcegchelper() runtime/proc.go:337 +0xb8 fp=0xc00004cfe0 sp=0xc00004cfa8 pc=0x55c065a63978 runtime.goexit({}) runtime/asm_amd64.s:1700 +0x1 fp=0xc00004cfe8 sp=0xc00004cfe0 pc=0x55c065a9ef21 created by runtime.init.7 in goroutine 1 runtime/proc.go:325 +0x1a goroutine 3 gp=0xc000007180 m=nil [GC sweep wait]: runtime.gopark(0x1?, 0x0?, 0x0?, 0x0?, 0x0?) runtime/proc.go:424 +0xce fp=0xc00004d780 sp=0xc00004d760 pc=0x55c065a972ee runtime.goparkunlock(...) 
runtime/proc.go:430 runtime.bgsweep(0xc000078000) runtime/mgcsweep.go:317 +0xdf fp=0xc00004d7c8 sp=0xc00004d780 pc=0x55c065a4e1ff runtime.gcenable.gowrap1() runtime/mgc.go:204 +0x25 fp=0xc00004d7e0 sp=0xc00004d7c8 pc=0x55c065a42a65 runtime.goexit({}) runtime/asm_amd64.s:1700 +0x1 fp=0xc00004d7e8 sp=0xc00004d7e0 pc=0x55c065a9ef21 created by runtime.gcenable in goroutine 1 runtime/mgc.go:204 +0x66 goroutine 4 gp=0xc000007340 m=nil [GC scavenge wait]: runtime.gopark(0x6fa01?, 0xfc623?, 0x0?, 0x0?, 0x0?) runtime/proc.go:424 +0xce fp=0xc00004df78 sp=0xc00004df58 pc=0x55c065a972ee runtime.goparkunlock(...) runtime/proc.go:430 runtime.(*scavengerState).park(0x55c0662c3080) runtime/mgcscavenge.go:425 +0x49 fp=0xc00004dfa8 sp=0xc00004df78 pc=0x55c065a4bbe9 runtime.bgscavenge(0xc000078000) runtime/mgcscavenge.go:658 +0x59 fp=0xc00004dfc8 sp=0xc00004dfa8 pc=0x55c065a4c179 runtime.gcenable.gowrap2() runtime/mgc.go:205 +0x25 fp=0xc00004dfe0 sp=0xc00004dfc8 pc=0x55c065a42a05 runtime.goexit({}) runtime/asm_amd64.s:1700 +0x1 fp=0xc00004dfe8 sp=0xc00004dfe0 pc=0x55c065a9ef21 created by runtime.gcenable in goroutine 1 runtime/mgc.go:205 +0xa5 goroutine 5 gp=0xc000007c00 m=nil [finalizer wait]: runtime.gopark(0x0?, 0x55c0660d30d8?, 0x30?, 0x0?, 0x1000000010?) runtime/proc.go:424 +0xce fp=0xc00004c620 sp=0xc00004c600 pc=0x55c065a972ee runtime.runfinq() runtime/mfinal.go:193 +0x107 fp=0xc00004c7e0 sp=0xc00004c620 pc=0x55c065a41ae7 runtime.goexit({}) runtime/asm_amd64.s:1700 +0x1 fp=0xc00004c7e8 sp=0xc00004c7e0 pc=0x55c065a9ef21 created by runtime.createfing in goroutine 1 runtime/mfinal.go:163 +0x3d goroutine 6 gp=0xc000007dc0 m=nil [chan receive]: runtime.gopark(0xc00004e760?, 0x55c065b4d785?, 0x70?, 0xa2?, 0x55c0660dbda0?) runtime/proc.go:424 +0xce fp=0xc00004e718 sp=0xc00004e6f8 pc=0x55c065a972ee runtime.chanrecv(0xc0000201c0, 0x0, 0x1) runtime/chan.go:639 +0x41c fp=0xc00004e790 sp=0xc00004e718 pc=0x55c065a3273c runtime.chanrecv1(0x0?, 0x0?) 
runtime/chan.go:489 +0x12 fp=0xc00004e7b8 sp=0xc00004e790 pc=0x55c065a32312 runtime.unique_runtime_registerUniqueMapCleanup.func1(...) runtime/mgc.go:1781 runtime.unique_runtime_registerUniqueMapCleanup.gowrap1() runtime/mgc.go:1784 +0x2f fp=0xc00004e7e0 sp=0xc00004e7b8 pc=0x55c065a458cf runtime.goexit({}) runtime/asm_amd64.s:1700 +0x1 fp=0xc00004e7e8 sp=0xc00004e7e0 pc=0x55c065a9ef21 created by unique.runtime_registerUniqueMapCleanup in goroutine 1 runtime/mgc.go:1779 +0x96 goroutine 24 gp=0xc00008cc40 m=nil [select]: runtime.gopark(0xc000139a68?, 0x2?, 0xd?, 0x77?, 0xc000139834?) runtime/proc.go:424 +0xce fp=0xc0001396a0 sp=0xc000139680 pc=0x55c065a972ee runtime.selectgo(0xc000139a68, 0xc000139830, 0xc0000b7100?, 0x0, 0x1?, 0x1) runtime/select.go:335 +0x7a5 fp=0xc0001397c8 sp=0xc0001396a0 pc=0x55c065a75545 github.com/ollama/ollama/llama/runner.(*Server).completion(0xc0000ac000, {0x55c0660d7c58, 0xc0001a20e0}, 0xc000134280) github.com/ollama/ollama/llama/runner/runner.go:687 +0xa86 fp=0xc000139ac0 sp=0xc0001397c8 pc=0x55c065cd9206 github.com/ollama/ollama/llama/runner.(*Server).completion-fm({0x55c0660d7c58?, 0xc0001a20e0?}, 0x55c065caa807?) <autogenerated>:1 +0x36 fp=0xc000139af0 sp=0xc000139ac0 pc=0x55c065cdc6b6 net/http.HandlerFunc.ServeHTTP(0xc0000c6000?, {0x55c0660d7c58?, 0xc0001a20e0?}, 0x0?) net/http/server.go:2220 +0x29 fp=0xc000139b18 sp=0xc000139af0 pc=0x55c065ca33c9 net/http.(*ServeMux).ServeHTTP(0x55c065a38f65?, {0x55c0660d7c58, 0xc0001a20e0}, 0xc000134280) net/http/server.go:2747 +0x1ca fp=0xc000139b68 sp=0xc000139b18 pc=0x55c065ca526a net/http.serverHandler.ServeHTTP({0x55c0660d6d10?}, {0x55c0660d7c58?, 0xc0001a20e0?}, 0x6?) 
net/http/server.go:3210 +0x8e fp=0xc000139b98 sp=0xc000139b68 pc=0x55c065cac16e net/http.(*conn).serve(0xc0000ac2d0, {0x55c0660d8088, 0xc0000a0300}) net/http/server.go:2092 +0x5d0 fp=0xc000139fb8 sp=0xc000139b98 pc=0x55c065ca1ff0 net/http.(*Server).Serve.gowrap3() net/http/server.go:3360 +0x28 fp=0xc000139fe0 sp=0xc000139fb8 pc=0x55c065ca6908 runtime.goexit({}) runtime/asm_amd64.s:1700 +0x1 fp=0xc000139fe8 sp=0xc000139fe0 pc=0x55c065a9ef21 created by net/http.(*Server).Serve in goroutine 1 net/http/server.go:3360 +0x485 goroutine 57 gp=0xc0001a6000 m=nil [IO wait]: runtime.gopark(0x55c065a3d445?, 0x0?, 0x0?, 0x87?, 0xb?) runtime/proc.go:424 +0xce fp=0xc0000485a8 sp=0xc000048588 pc=0x55c065a972ee runtime.netpollblock(0x55c065ad2b18?, 0x65a2fb46?, 0xc0?) runtime/netpoll.go:575 +0xf7 fp=0xc0000485e0 sp=0xc0000485a8 pc=0x55c065a5c057 internal/poll.runtime_pollWait(0x7ff2d39c4ce8, 0x72) runtime/netpoll.go:351 +0x85 fp=0xc000048600 sp=0xc0000485e0 pc=0x55c065a965e5 internal/poll.(*pollDesc).wait(0xc0000a6200?, 0xc0000a06a1?, 0x0) internal/poll/fd_poll_runtime.go:84 +0x27 fp=0xc000048628 sp=0xc000048600 pc=0x55c065aec427 internal/poll.(*pollDesc).waitRead(...) 
internal/poll/fd_poll_runtime.go:89 internal/poll.(*FD).Read(0xc0000a6200, {0xc0000a06a1, 0x1, 0x1}) internal/poll/fd_unix.go:165 +0x27a fp=0xc0000486c0 sp=0xc000048628 pc=0x55c065aecf7a net.(*netFD).Read(0xc0000a6200, {0xc0000a06a1?, 0xc000048748?, 0x55c065a98990?}) net/fd_posix.go:55 +0x25 fp=0xc000048708 sp=0xc0000486c0 pc=0x55c065b65185 net.(*conn).Read(0xc0000a8050, {0xc0000a06a1?, 0x0?, 0x55c066368320?}) net/net.go:189 +0x45 fp=0xc000048750 sp=0xc000048708 pc=0x55c065b6eb85 net.(*TCPConn).Read(0x55c066283d90?, {0xc0000a06a1?, 0x0?, 0x0?}) <autogenerated>:1 +0x25 fp=0xc000048780 sp=0xc000048750 pc=0x55c065b7bc25 net/http.(*connReader).backgroundRead(0xc0000a0690) net/http/server.go:690 +0x37 fp=0xc0000487c8 sp=0xc000048780 pc=0x55c065c9c977 net/http.(*connReader).startBackgroundRead.gowrap2() net/http/server.go:686 +0x25 fp=0xc0000487e0 sp=0xc0000487c8 pc=0x55c065c9c8a5 runtime.goexit({}) runtime/asm_amd64.s:1700 +0x1 fp=0xc0000487e8 sp=0xc0000487e0 pc=0x55c065a9ef21 created by net/http.(*connReader).startBackgroundRead in goroutine 24 net/http/server.go:686 +0xb6 goroutine 41 gp=0xc0002301c0 m=nil [GC worker (idle)]: runtime.gopark(0x0?, 0x0?, 0x0?, 0x0?, 0x0?) runtime/proc.go:424 +0xce fp=0xc0001ae738 sp=0xc0001ae718 pc=0x55c065a972ee runtime.gcBgMarkWorker(0xc00022a380) runtime/mgc.go:1412 +0xe9 fp=0xc0001ae7c8 sp=0xc0001ae738 pc=0x55c065a44bc9 runtime.gcBgMarkStartWorkers.gowrap1() runtime/mgc.go:1328 +0x25 fp=0xc0001ae7e0 sp=0xc0001ae7c8 pc=0x55c065a44aa5 runtime.goexit({}) runtime/asm_amd64.s:1700 +0x1 fp=0xc0001ae7e8 sp=0xc0001ae7e0 pc=0x55c065a9ef21 created by runtime.gcBgMarkStartWorkers in goroutine 24 runtime/mgc.go:1328 +0x105 goroutine 42 gp=0xc000230380 m=nil [GC worker (idle)]: runtime.gopark(0x5207922bbd5d?, 0x1?, 0x36?, 0x1?, 0x0?) 
runtime/proc.go:424 +0xce fp=0xc0001aef38 sp=0xc0001aef18 pc=0x55c065a972ee runtime.gcBgMarkWorker(0xc00022a380) runtime/mgc.go:1412 +0xe9 fp=0xc0001aefc8 sp=0xc0001aef38 pc=0x55c065a44bc9 runtime.gcBgMarkStartWorkers.gowrap1() runtime/mgc.go:1328 +0x25 fp=0xc0001aefe0 sp=0xc0001aefc8 pc=0x55c065a44aa5 runtime.goexit({}) runtime/asm_amd64.s:1700 +0x1 fp=0xc0001aefe8 sp=0xc0001aefe0 pc=0x55c065a9ef21 created by runtime.gcBgMarkStartWorkers in goroutine 24 runtime/mgc.go:1328 +0x105 goroutine 43 gp=0xc000230540 m=nil [GC worker (idle)]: runtime.gopark(0x5207922c8bbc?, 0x0?, 0x0?, 0x0?, 0x0?) runtime/proc.go:424 +0xce fp=0xc0001af738 sp=0xc0001af718 pc=0x55c065a972ee runtime.gcBgMarkWorker(0xc00022a380) runtime/mgc.go:1412 +0xe9 fp=0xc0001af7c8 sp=0xc0001af738 pc=0x55c065a44bc9 runtime.gcBgMarkStartWorkers.gowrap1() runtime/mgc.go:1328 +0x25 fp=0xc0001af7e0 sp=0xc0001af7c8 pc=0x55c065a44aa5 runtime.goexit({}) runtime/asm_amd64.s:1700 +0x1 fp=0xc0001af7e8 sp=0xc0001af7e0 pc=0x55c065a9ef21 created by runtime.gcBgMarkStartWorkers in goroutine 24 runtime/mgc.go:1328 +0x105 goroutine 44 gp=0xc000230700 m=nil [GC worker (idle)]: runtime.gopark(0x5207922b5985?, 0x0?, 0x0?, 0x0?, 0x0?) 
runtime/proc.go:424 +0xce fp=0xc0001aff38 sp=0xc0001aff18 pc=0x55c065a972ee runtime.gcBgMarkWorker(0xc00022a380) runtime/mgc.go:1412 +0xe9 fp=0xc0001affc8 sp=0xc0001aff38 pc=0x55c065a44bc9 runtime.gcBgMarkStartWorkers.gowrap1() runtime/mgc.go:1328 +0x25 fp=0xc0001affe0 sp=0xc0001affc8 pc=0x55c065a44aa5 runtime.goexit({}) runtime/asm_amd64.s:1700 +0x1 fp=0xc0001affe8 sp=0xc0001affe0 pc=0x55c065a9ef21 created by runtime.gcBgMarkStartWorkers in goroutine 24 runtime/mgc.go:1328 +0x105 rax 0x206803fcc rbx 0x7ff25816e110 rcx 0xff3 rdx 0x7ff258006840 rdi 0x7ff258006850 rsi 0x0 rbp 0x7ff26cf49d50 rsp 0x7ff26cf49d30 r8 0x7ff25801f948 r9 0x0 r10 0x0 r11 0x246 r12 0x7ff26008e130 r13 0x7ff258006850 r14 0x0 r15 0x7ff32697de40 rip 0x7ff2dadefe57 rflags 0x10297 cs 0x33 fs 0x0 gs 0x0 SIGABRT: abort PC=0x7ff2b560eb1c m=5 sigcode=18446744073709551610 signal arrived during cgo execution goroutine 19 gp=0xc00008c540 m=5 mp=0xc000088008 [syscall]: runtime.cgocall(0x55c065cdcfb0, 0xc000061ba0) runtime/cgocall.go:167 +0x4b fp=0xc000061b78 sp=0xc000061b40 pc=0x55c065a914eb github.com/ollama/ollama/llama._Cfunc_llama_decode(0x7ff258c39300, {0x1, 0x7ff258d88d00, 0x0, 0x0, 0x7ff258b81890, 0x7ff258b820a0, 0x7ff258b18630, 0x7ff259ae3550}) _cgo_gotypes.go:556 +0x4f fp=0xc000061ba0 sp=0xc000061b78 pc=0x55c065b3b56f github.com/ollama/ollama/llama.(*Context).Decode.func1(0x55c065cd86eb?, 0x7ff258c39300?) github.com/ollama/ollama/llama/llama.go:207 +0xf5 fp=0xc000061c90 sp=0xc000061ba0 pc=0x55c065b3dd95 github.com/ollama/ollama/llama.(*Context).Decode(0x55c066368320?, 0x0?) 
github.com/ollama/ollama/llama/llama.go:207 +0x13 fp=0xc000061cd8 sp=0xc000061c90 pc=0x55c065b3dc13 github.com/ollama/ollama/llama/runner.(*Server).processBatch(0xc0000ac000, 0xc0001ac0c0, 0xc0001ac120) github.com/ollama/ollama/llama/runner/runner.go:434 +0x23f fp=0xc000061ee0 sp=0xc000061cd8 pc=0x55c065cd74df github.com/ollama/ollama/llama/runner.(*Server).run(0xc0000ac000, {0x55c0660d80c0, 0xc0000aa050}) github.com/ollama/ollama/llama/runner/runner.go:342 +0x1d5 fp=0xc000061fb8 sp=0xc000061ee0 pc=0x55c065cd6f15 github.com/ollama/ollama/llama/runner.Execute.gowrap2() github.com/ollama/ollama/llama/runner/runner.go:975 +0x28 fp=0xc000061fe0 sp=0xc000061fb8 pc=0x55c065cdbe08 runtime.goexit({}) runtime/asm_amd64.s:1700 +0x1 fp=0xc000061fe8 sp=0xc000061fe0 pc=0x55c065a9ef21 created by github.com/ollama/ollama/llama/runner.Execute in goroutine 1 github.com/ollama/ollama/llama/runner/runner.go:975 +0xde5 goroutine 1 gp=0xc0000061c0 m=nil [IO wait]: runtime.gopark(0x0?, 0x0?, 0x0?, 0x0?, 0x0?) runtime/proc.go:424 +0xce fp=0xc00013b7b0 sp=0xc00013b790 pc=0x55c065a972ee runtime.netpollblock(0xc000031800?, 0x65a2fb46?, 0xc0?) runtime/netpoll.go:575 +0xf7 fp=0xc00013b7e8 sp=0xc00013b7b0 pc=0x55c065a5c057 internal/poll.runtime_pollWait(0x7ff2d39c4e00, 0x72) runtime/netpoll.go:351 +0x85 fp=0xc00013b808 sp=0xc00013b7e8 pc=0x55c065a965e5 internal/poll.(*pollDesc).wait(0xc0000a6100?, 0x900000036?, 0x0) internal/poll/fd_poll_runtime.go:84 +0x27 fp=0xc00013b830 sp=0xc00013b808 pc=0x55c065aec427 internal/poll.(*pollDesc).waitRead(...) 
internal/poll/fd_poll_runtime.go:89 internal/poll.(*FD).Accept(0xc0000a6100) internal/poll/fd_unix.go:620 +0x295 fp=0xc00013b8d8 sp=0xc00013b830 pc=0x55c065aed995 net.(*netFD).accept(0xc0000a6100) net/fd_unix.go:172 +0x29 fp=0xc00013b990 sp=0xc00013b8d8 pc=0x55c065b66269 net.(*TCPListener).accept(0xc0000b0040) net/tcpsock_posix.go:159 +0x1e fp=0xc00013b9e0 sp=0xc00013b990 pc=0x55c065b768be net.(*TCPListener).Accept(0xc0000b0040) net/tcpsock.go:372 +0x30 fp=0xc00013ba10 sp=0xc00013b9e0 pc=0x55c065b75bf0 net/http.(*onceCloseListener).Accept(0xc0000ac2d0?) <autogenerated>:1 +0x24 fp=0xc00013ba28 sp=0xc00013ba10 pc=0x55c065cb47c4 net/http.(*Server).Serve(0xc00009e3c0, {0x55c0660d7ad8, 0xc0000b0040}) net/http/server.go:3330 +0x30c fp=0xc00013bb58 sp=0xc00013ba28 pc=0x55c065ca650c github.com/ollama/ollama/llama/runner.Execute({0xc000016130?, 0x55c065a9eb7c?, 0x0?}) github.com/ollama/ollama/llama/runner/runner.go:996 +0x11a9 fp=0xc00013bef8 sp=0xc00013bb58 pc=0x55c065cdbae9 main.main() github.com/ollama/ollama/cmd/runner/main.go:11 +0x54 fp=0xc00013bf50 sp=0xc00013bef8 pc=0x55c065cdca74 runtime.main() runtime/proc.go:272 +0x29d fp=0xc00013bfe0 sp=0xc00013bf50 pc=0x55c065a6363d runtime.goexit({}) runtime/asm_amd64.s:1700 +0x1 fp=0xc00013bfe8 sp=0xc00013bfe0 pc=0x55c065a9ef21 goroutine 2 gp=0xc000006c40 m=nil [force gc (idle)]: runtime.gopark(0x0?, 0x0?, 0x0?, 0x0?, 0x0?) runtime/proc.go:424 +0xce fp=0xc00004cfa8 sp=0xc00004cf88 pc=0x55c065a972ee runtime.goparkunlock(...) runtime/proc.go:430 runtime.forcegchelper() runtime/proc.go:337 +0xb8 fp=0xc00004cfe0 sp=0xc00004cfa8 pc=0x55c065a63978 runtime.goexit({}) runtime/asm_amd64.s:1700 +0x1 fp=0xc00004cfe8 sp=0xc00004cfe0 pc=0x55c065a9ef21 created by runtime.init.7 in goroutine 1 runtime/proc.go:325 +0x1a goroutine 3 gp=0xc000007180 m=nil [GC sweep wait]: runtime.gopark(0x1?, 0x0?, 0x0?, 0x0?, 0x0?) runtime/proc.go:424 +0xce fp=0xc00004d780 sp=0xc00004d760 pc=0x55c065a972ee runtime.goparkunlock(...) 
runtime/proc.go:430 runtime.bgsweep(0xc000078000) runtime/mgcsweep.go:317 +0xdf fp=0xc00004d7c8 sp=0xc00004d780 pc=0x55c065a4e1ff runtime.gcenable.gowrap1() runtime/mgc.go:204 +0x25 fp=0xc00004d7e0 sp=0xc00004d7c8 pc=0x55c065a42a65 runtime.goexit({}) runtime/asm_amd64.s:1700 +0x1 fp=0xc00004d7e8 sp=0xc00004d7e0 pc=0x55c065a9ef21 created by runtime.gcenable in goroutine 1 runtime/mgc.go:204 +0x66 goroutine 4 gp=0xc000007340 m=nil [GC scavenge wait]: runtime.gopark(0x6fa01?, 0xfc623?, 0x0?, 0x0?, 0x0?) runtime/proc.go:424 +0xce fp=0xc00004df78 sp=0xc00004df58 pc=0x55c065a972ee runtime.goparkunlock(...) runtime/proc.go:430 runtime.(*scavengerState).park(0x55c0662c3080) runtime/mgcscavenge.go:425 +0x49 fp=0xc00004dfa8 sp=0xc00004df78 pc=0x55c065a4bbe9 runtime.bgscavenge(0xc000078000) runtime/mgcscavenge.go:658 +0x59 fp=0xc00004dfc8 sp=0xc00004dfa8 pc=0x55c065a4c179 runtime.gcenable.gowrap2() runtime/mgc.go:205 +0x25 fp=0xc00004dfe0 sp=0xc00004dfc8 pc=0x55c065a42a05 runtime.goexit({}) runtime/asm_amd64.s:1700 +0x1 fp=0xc00004dfe8 sp=0xc00004dfe0 pc=0x55c065a9ef21 created by runtime.gcenable in goroutine 1 runtime/mgc.go:205 +0xa5 goroutine 5 gp=0xc000007c00 m=nil [finalizer wait]: runtime.gopark(0x0?, 0x55c0660d30d8?, 0x30?, 0x0?, 0x1000000010?) runtime/proc.go:424 +0xce fp=0xc00004c620 sp=0xc00004c600 pc=0x55c065a972ee runtime.runfinq() runtime/mfinal.go:193 +0x107 fp=0xc00004c7e0 sp=0xc00004c620 pc=0x55c065a41ae7 runtime.goexit({}) runtime/asm_amd64.s:1700 +0x1 fp=0xc00004c7e8 sp=0xc00004c7e0 pc=0x55c065a9ef21 created by runtime.createfing in goroutine 1 runtime/mfinal.go:163 +0x3d goroutine 6 gp=0xc000007dc0 m=nil [chan receive]: runtime.gopark(0xc00004e760?, 0x55c065b4d785?, 0x70?, 0xa2?, 0x55c0660dbda0?) runtime/proc.go:424 +0xce fp=0xc00004e718 sp=0xc00004e6f8 pc=0x55c065a972ee runtime.chanrecv(0xc0000201c0, 0x0, 0x1) runtime/chan.go:639 +0x41c fp=0xc00004e790 sp=0xc00004e718 pc=0x55c065a3273c runtime.chanrecv1(0x0?, 0x0?) 
runtime/chan.go:489 +0x12 fp=0xc00004e7b8 sp=0xc00004e790 pc=0x55c065a32312 runtime.unique_runtime_registerUniqueMapCleanup.func1(...) runtime/mgc.go:1781 runtime.unique_runtime_registerUniqueMapCleanup.gowrap1() runtime/mgc.go:1784 +0x2f fp=0xc00004e7e0 sp=0xc00004e7b8 pc=0x55c065a458cf runtime.goexit({}) runtime/asm_amd64.s:1700 +0x1 fp=0xc00004e7e8 sp=0xc00004e7e0 pc=0x55c065a9ef21 created by unique.runtime_registerUniqueMapCleanup in goroutine 1 runtime/mgc.go:1779 +0x96 goroutine 24 gp=0xc00008cc40 m=nil [select]: runtime.gopark(0xc000139a68?, 0x2?, 0xd?, 0x77?, 0xc000139834?) runtime/proc.go:424 +0xce fp=0xc0001396a0 sp=0xc000139680 pc=0x55c065a972ee runtime.selectgo(0xc000139a68, 0xc000139830, 0xc0000b7100?, 0x0, 0x1?, 0x1) runtime/select.go:335 +0x7a5 fp=0xc0001397c8 sp=0xc0001396a0 pc=0x55c065a75545 github.com/ollama/ollama/llama/runner.(*Server).completion(0xc0000ac000, {0x55c0660d7c58, 0xc0001a20e0}, 0xc000134280) github.com/ollama/ollama/llama/runner/runner.go:687 +0xa86 fp=0xc000139ac0 sp=0xc0001397c8 pc=0x55c065cd9206 github.com/ollama/ollama/llama/runner.(*Server).completion-fm({0x55c0660d7c58?, 0xc0001a20e0?}, 0x55c065caa807?) <autogenerated>:1 +0x36 fp=0xc000139af0 sp=0xc000139ac0 pc=0x55c065cdc6b6 net/http.HandlerFunc.ServeHTTP(0xc0000c6000?, {0x55c0660d7c58?, 0xc0001a20e0?}, 0x0?) net/http/server.go:2220 +0x29 fp=0xc000139b18 sp=0xc000139af0 pc=0x55c065ca33c9 net/http.(*ServeMux).ServeHTTP(0x55c065a38f65?, {0x55c0660d7c58, 0xc0001a20e0}, 0xc000134280) net/http/server.go:2747 +0x1ca fp=0xc000139b68 sp=0xc000139b18 pc=0x55c065ca526a net/http.serverHandler.ServeHTTP({0x55c0660d6d10?}, {0x55c0660d7c58?, 0xc0001a20e0?}, 0x6?) 
net/http/server.go:3210 +0x8e fp=0xc000139b98 sp=0xc000139b68 pc=0x55c065cac16e net/http.(*conn).serve(0xc0000ac2d0, {0x55c0660d8088, 0xc0000a0300}) net/http/server.go:2092 +0x5d0 fp=0xc000139fb8 sp=0xc000139b98 pc=0x55c065ca1ff0 net/http.(*Server).Serve.gowrap3() net/http/server.go:3360 +0x28 fp=0xc000139fe0 sp=0xc000139fb8 pc=0x55c065ca6908 runtime.goexit({}) runtime/asm_amd64.s:1700 +0x1 fp=0xc000139fe8 sp=0xc000139fe0 pc=0x55c065a9ef21 created by net/http.(*Server).Serve in goroutine 1 net/http/server.go:3360 +0x485 goroutine 57 gp=0xc0001a6000 m=nil [IO wait]: runtime.gopark(0x55c065a3d445?, 0x0?, 0x0?, 0x87?, 0xb?) runtime/proc.go:424 +0xce fp=0xc0000485a8 sp=0xc000048588 pc=0x55c065a972ee runtime.netpollblock(0x55c065ad2b18?, 0x65a2fb46?, 0xc0?) runtime/netpoll.go:575 +0xf7 fp=0xc0000485e0 sp=0xc0000485a8 pc=0x55c065a5c057 internal/poll.runtime_pollWait(0x7ff2d39c4ce8, 0x72) runtime/netpoll.go:351 +0x85 fp=0xc000048600 sp=0xc0000485e0 pc=0x55c065a965e5 internal/poll.(*pollDesc).wait(0xc0000a6200?, 0xc0000a06a1?, 0x0) internal/poll/fd_poll_runtime.go:84 +0x27 fp=0xc000048628 sp=0xc000048600 pc=0x55c065aec427 internal/poll.(*pollDesc).waitRead(...) 
internal/poll/fd_poll_runtime.go:89 internal/poll.(*FD).Read(0xc0000a6200, {0xc0000a06a1, 0x1, 0x1}) internal/poll/fd_unix.go:165 +0x27a fp=0xc0000486c0 sp=0xc000048628 pc=0x55c065aecf7a net.(*netFD).Read(0xc0000a6200, {0xc0000a06a1?, 0xc000048748?, 0x55c065a98990?}) net/fd_posix.go:55 +0x25 fp=0xc000048708 sp=0xc0000486c0 pc=0x55c065b65185 net.(*conn).Read(0xc0000a8050, {0xc0000a06a1?, 0x0?, 0x55c066368320?}) net/net.go:189 +0x45 fp=0xc000048750 sp=0xc000048708 pc=0x55c065b6eb85 net.(*TCPConn).Read(0x55c066283d90?, {0xc0000a06a1?, 0x0?, 0x0?}) <autogenerated>:1 +0x25 fp=0xc000048780 sp=0xc000048750 pc=0x55c065b7bc25 net/http.(*connReader).backgroundRead(0xc0000a0690) net/http/server.go:690 +0x37 fp=0xc0000487c8 sp=0xc000048780 pc=0x55c065c9c977 net/http.(*connReader).startBackgroundRead.gowrap2() net/http/server.go:686 +0x25 fp=0xc0000487e0 sp=0xc0000487c8 pc=0x55c065c9c8a5 runtime.goexit({}) runtime/asm_amd64.s:1700 +0x1 fp=0xc0000487e8 sp=0xc0000487e0 pc=0x55c065a9ef21 created by net/http.(*connReader).startBackgroundRead in goroutine 24 net/http/server.go:686 +0xb6 goroutine 41 gp=0xc0002301c0 m=nil [GC worker (idle)]: runtime.gopark(0x0?, 0x0?, 0x0?, 0x0?, 0x0?) runtime/proc.go:424 +0xce fp=0xc0001ae738 sp=0xc0001ae718 pc=0x55c065a972ee runtime.gcBgMarkWorker(0xc00022a380) runtime/mgc.go:1412 +0xe9 fp=0xc0001ae7c8 sp=0xc0001ae738 pc=0x55c065a44bc9 runtime.gcBgMarkStartWorkers.gowrap1() runtime/mgc.go:1328 +0x25 fp=0xc0001ae7e0 sp=0xc0001ae7c8 pc=0x55c065a44aa5 runtime.goexit({}) runtime/asm_amd64.s:1700 +0x1 fp=0xc0001ae7e8 sp=0xc0001ae7e0 pc=0x55c065a9ef21 created by runtime.gcBgMarkStartWorkers in goroutine 24 runtime/mgc.go:1328 +0x105 goroutine 42 gp=0xc000230380 m=nil [GC worker (idle)]: runtime.gopark(0x5207922bbd5d?, 0x1?, 0x36?, 0x1?, 0x0?) 
runtime/proc.go:424 +0xce fp=0xc0001aef38 sp=0xc0001aef18 pc=0x55c065a972ee runtime.gcBgMarkWorker(0xc00022a380) runtime/mgc.go:1412 +0xe9 fp=0xc0001aefc8 sp=0xc0001aef38 pc=0x55c065a44bc9 runtime.gcBgMarkStartWorkers.gowrap1() runtime/mgc.go:1328 +0x25 fp=0xc0001aefe0 sp=0xc0001aefc8 pc=0x55c065a44aa5 runtime.goexit({}) runtime/asm_amd64.s:1700 +0x1 fp=0xc0001aefe8 sp=0xc0001aefe0 pc=0x55c065a9ef21 created by runtime.gcBgMarkStartWorkers in goroutine 24 runtime/mgc.go:1328 +0x105 goroutine 43 gp=0xc000230540 m=nil [GC worker (idle)]: runtime.gopark(0x5207922c8bbc?, 0x0?, 0x0?, 0x0?, 0x0?) runtime/proc.go:424 +0xce fp=0xc0001af738 sp=0xc0001af718 pc=0x55c065a972ee runtime.gcBgMarkWorker(0xc00022a380) runtime/mgc.go:1412 +0xe9 fp=0xc0001af7c8 sp=0xc0001af738 pc=0x55c065a44bc9 runtime.gcBgMarkStartWorkers.gowrap1() runtime/mgc.go:1328 +0x25 fp=0xc0001af7e0 sp=0xc0001af7c8 pc=0x55c065a44aa5 runtime.goexit({}) runtime/asm_amd64.s:1700 +0x1 fp=0xc0001af7e8 sp=0xc0001af7e0 pc=0x55c065a9ef21 created by runtime.gcBgMarkStartWorkers in goroutine 24 runtime/mgc.go:1328 +0x105 goroutine 44 gp=0xc000230700 m=nil [GC worker (idle)]: runtime.gopark(0x5207922b5985?, 0x0?, 0x0?, 0x0?, 0x0?) runtime/proc.go:424 +0xce fp=0xc0001aff38 sp=0xc0001aff18 pc=0x55c065a972ee runtime.gcBgMarkWorker(0xc00022a380) runtime/mgc.go:1412 +0xe9 fp=0xc0001affc8 sp=0xc0001aff38 pc=0x55c065a44bc9 runtime.gcBgMarkStartWorkers.gowrap1() runtime/mgc.go:1328 +0x25 fp=0xc0001affe0 sp=0xc0001affc8 pc=0x55c065a44aa5 runtime.goexit({}) runtime/asm_amd64.s:1700 +0x1 fp=0xc0001affe8 sp=0xc0001affe0 pc=0x55c065a9ef21 created by runtime.gcBgMarkStartWorkers in goroutine 24 runtime/mgc.go:1328 +0x105 rax 0x0 rbx 0xc1 rcx 0x7ff2b560eb1c rdx 0x6 rdi 0xbd rsi 0xc1 rbp 0x7ff26cf49eb0 rsp 0x7ff26cf49e70 r8 0x0 r9 0x0 r10 0x8 r11 0x246 r12 0x6 r13 0x640 r14 0x16 r15 0x4000000 rip 0x7ff2b560eb1c rflags 0x246 cs 0x33 fs 0x0 gs 0x0 ```
Author
Owner

@rick-github commented on GitHub (Feb 12, 2025):

Version of ollama? Prompt?

<!-- gh-comment-id:2654229529 --> @rick-github commented on GitHub (Feb 12, 2025): Version of ollama? Prompt?
Author
Owner

@anton-b commented on GitHub (Feb 12, 2025):

Version of ollama? Prompt?

ollama --version
ollama version is 0.5.7

The prompt is covered by an NDA, so I can't share it, but once I restart Ollama and pass the same prompt, it does not crash this way.

<!-- gh-comment-id:2654348681 --> @anton-b commented on GitHub (Feb 12, 2025): > Version of ollama? Prompt? ollama --version ollama version is 0.5.7 prompt contains NDA, can't share, but once i restart ollama and pass the same prompt it is not crashing this way.
Author
Owner

@anton-b commented on GitHub (Feb 12, 2025):

OK — it just happened to me again while trying llama3.2-vision:11b-instruct-fp16 in the same setup.

Also, I am running in an EKS environment.

free -h
total used free shared buff/cache available
Mem: 30Gi 1.8Gi 746Mi 3.6Mi 28Gi 29Gi
Swap: 0B 0B 0B

Tasks: 4 total, 1 running, 3 sleeping, 0 stopped, 0 zombie
%Cpu(s): 0.4 us, 0.2 sy, 0.0 ni, 99.3 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
MiB Mem : 31631.2 total, 751.5 free, 1843.8 used, 29490.3 buff/cache
MiB Swap: 0.0 total, 0.0 free, 0.0 used. 29787.4 avail Mem

PID USER      PR  NI    VIRT    RES    SHR S  %CPU  %MEM     TIME+ COMMAND
  1 navi_gp+  20   0 7570188 145204  27032 S   0.0   0.4   2:01.60 ollama
187 navi_gp+  20   0    2804   1104   1008 S   0.0   0.0   0:00.00 sh
193 navi_gp+  20   0    4592   3920   3276 S   0.0   0.0   0:00.01 bash
214 navi_gp+  20   0    8872   5084   2964 R   0.0   0.0   0:00.00 top

ollama ps
NAME ID SIZE PROCESSOR UNTIL
llama3.2-vision-tool:latest f39fcb230724 28 GB 100% GPU Forever

nvidia-smi
Wed Feb 12 17:22:35 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA L40S                    On  |   00000000:30:00.0 Off |                    0 |
| N/A   22C    P8             31W /  350W |       4MiB /  46068MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+

<!-- gh-comment-id:2654385578 --> @anton-b commented on GitHub (Feb 12, 2025): kk just happened with me trying llama3.2-vision:11b-instruct-fp16 in the same setup Also i am running in the EKS environment. free -h total used free shared buff/cache available Mem: 30Gi 1.8Gi 746Mi 3.6Mi 28Gi 29Gi Swap: 0B 0B 0B Tasks: 4 total, 1 running, 3 sleeping, 0 stopped, 0 zombie %Cpu(s): 0.4 us, 0.2 sy, 0.0 ni, 99.3 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st MiB Mem : 31631.2 total, 751.5 free, 1843.8 used, 29490.3 buff/cache MiB Swap: 0.0 total, 0.0 free, 0.0 used. 29787.4 avail Mem PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 1 navi_gp+ 20 0 7570188 145204 27032 S 0.0 0.4 2:01.60 ollama 187 navi_gp+ 20 0 2804 1104 1008 S 0.0 0.0 0:00.00 sh 193 navi_gp+ 20 0 4592 3920 3276 S 0.0 0.0 0:00.01 bash 214 navi_gp+ 20 0 8872 5084 2964 R 0.0 0.0 0:00.00 top ollama ps NAME ID SIZE PROCESSOR UNTIL llama3.2-vision-tool:latest f39fcb230724 28 GB 100% GPU Forever ``` nvidia-smi Wed Feb 12 17:22:35 2025 +-----------------------------------------------------------------------------------------+ | NVIDIA-SMI 560.35.05 Driver Version: 560.35.05 CUDA Version: 12.6 | |-----------------------------------------+------------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |=========================================+========================+======================| | 0 NVIDIA L40S On | 00000000:30:00.0 Off | 0 | | N/A 22C P8 31W / 350W | 4MiB / 46068MiB | 0% Default | | | | N/A | +-----------------------------------------+------------------------+----------------------+ +-----------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=========================================================================================| | No running processes found | +-----------------------------------------------------------------------------------------+ ```
Author
Owner

@bitsydarel commented on GitHub (May 25, 2025):

Same issue on the latest ollama version:

ResponseError('an error was encountered while running the model: GGML_ASSERT(view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src)) failed')Traceback (most recent call last):


  File "/Users/darelbitsy/PycharmProjects/btp_pdf_rag/.venv/lib/python3.13/site-packages/langchain_core/language_models/chat_models.py", line 1084, in _agenerate_with_cache
    result = await self._agenerate(
             ^^^^^^^^^^^^^^^^^^^^^^
        messages, stop=stop, run_manager=run_manager, **kwargs
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^


  File "/Users/darelbitsy/PycharmProjects/btp_pdf_rag/.venv/lib/python3.13/site-packages/langchain_ollama/chat_models.py", line 847, in _agenerate
    final_chunk = await self._achat_stream_with_aggregation(
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        messages, stop, run_manager, verbose=self.verbose, **kwargs
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^


  File "/Users/darelbitsy/PycharmProjects/btp_pdf_rag/.venv/lib/python3.13/site-packages/langchain_ollama/chat_models.py", line 677, in _achat_stream_with_aggregation
    async for chunk in self._aiterate_over_stream(messages, stop, **kwargs):
    ...<9 lines>...
            )


  File "/Users/darelbitsy/PycharmProjects/btp_pdf_rag/.venv/lib/python3.13/site-packages/langchain_ollama/chat_models.py", line 792, in _aiterate_over_stream
    async for stream_resp in self._acreate_chat_stream(messages, stop, **kwargs):
    ...<30 lines>...
            yield chunk


  File "/Users/darelbitsy/PycharmProjects/btp_pdf_rag/.venv/lib/python3.13/site-packages/langchain_ollama/chat_models.py", line 625, in _acreate_chat_stream
    async for part in await self._async_client.chat(**chat_params):
        yield part


  File "/Users/darelbitsy/PycharmProjects/btp_pdf_rag/.venv/lib/python3.13/site-packages/ollama/_client.py", line 677, in inner
    raise ResponseError(err)


ollama._types.ResponseError: an error was encountered while running the model: GGML_ASSERT(view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src)) failed (status code: -1)

Prompt:

You are DAIFY, a highly specialized construction AI assistant.
Carefully analyze the user input and, step by step, generate a concise summary and extract key vocabulary or glossary terms relevant to construction projects, clearly explaining your reasoning for each identified term.

Create a precise summary (maximum 5000 characters) of the provided text in french.
For each identified term, your goal is to identify other FRENCH terms, concepts, or specifiers that could contextually substitute, detail, or elaborate on the original term *ONLY within the construction domain* from the provided text.
Think of it like breaking down a general construction term into its more specific components, materials, types, or related essential characteristics.

**Respond ONLY in JSON format enclosed in backticks with the following format:**
```json
{
  "$defs": {
    "_GlossaryEntry": {
      "description": "Represents a glossary entry with a term and its list of substituting or specifying terms.",
      "properties": {
        "term": {
          "description": "Vocabulary term",
          "title": "Term",
          "type": "string"
        },
        "contextual_specifiers": {
          "description": "List of substituting, specifying, or elaborating construction-related terms or concepts",
          "items": {
            "type": "string"
          },
          "title": "Contextual Specifiers",
          "type": "array"
        }
      },
      "required": [
        "term",
        "contextual_specifiers"
      ],
      "title": "_GlossaryEntry",
      "type": "object"
    }
  },
  "description": "Response model for the Summarization Agent.",
  "properties": {
    "summary": {
      "description": "Generated summary of the document",
      "title": "Summary",
      "type": "string"
    },
    "glossary": {
      "default": "Generated list of glossary entries",
      "items": {
        "$ref": "#/$defs/_GlossaryEntry"
      },
      "title": "Glossary",
      "type": "array"
    }
  },
  "required": [
    "summary"
  ],
  "title": "_SummaryAgentResponse",
  "type": "object"
}


Langchain metadata:

```json
{"checkpoint_ns":"summarize_agent:8b8ae09b-1ca5-d022-00c6-55b41292404a","langgraph_checkpoint_ns":"summarize_agent:8b8ae09b-1ca5-d022-00c6-55b41292404a|summarize_image:11444a66-cd7a-4545-696b-7010af02d6d8","langgraph_node":"summarize_image","langgraph_path":["__pregel_pull","summarize_image"],"langgraph_step":2,"langgraph_triggers":["branch:to:summarize_image"],"ls_model_name":"gemma3:4b-it-q8_0","ls_model_type":"chat","ls_provider":"ollama","ls_temperature":0,"revision_id":"4ab1bea-dirty","thread_id":"chunk_465","ls_run_depth":6}
<!-- gh-comment-id:2907781343 --> @bitsydarel commented on GitHub (May 25, 2025): Same issue on the latest ollama version: ``` ResponseError('an error was encountered while running the model: GGML_ASSERT(view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src)) failed')Traceback (most recent call last): File "/Users/darelbitsy/PycharmProjects/btp_pdf_rag/.venv/lib/python3.13/site-packages/langchain_core/language_models/chat_models.py", line 1084, in _agenerate_with_cache result = await self._agenerate( ^^^^^^^^^^^^^^^^^^^^^^ messages, stop=stop, run_manager=run_manager, **kwargs ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ) ^ File "/Users/darelbitsy/PycharmProjects/btp_pdf_rag/.venv/lib/python3.13/site-packages/langchain_ollama/chat_models.py", line 847, in _agenerate final_chunk = await self._achat_stream_with_aggregation( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ messages, stop, run_manager, verbose=self.verbose, **kwargs ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ) ^ File "/Users/darelbitsy/PycharmProjects/btp_pdf_rag/.venv/lib/python3.13/site-packages/langchain_ollama/chat_models.py", line 677, in _achat_stream_with_aggregation async for chunk in self._aiterate_over_stream(messages, stop, **kwargs): ...<9 lines>... ) File "/Users/darelbitsy/PycharmProjects/btp_pdf_rag/.venv/lib/python3.13/site-packages/langchain_ollama/chat_models.py", line 792, in _aiterate_over_stream async for stream_resp in self._acreate_chat_stream(messages, stop, **kwargs): ...<30 lines>... 
yield chunk File "/Users/darelbitsy/PycharmProjects/btp_pdf_rag/.venv/lib/python3.13/site-packages/langchain_ollama/chat_models.py", line 625, in _acreate_chat_stream async for part in await self._async_client.chat(**chat_params): yield part File "/Users/darelbitsy/PycharmProjects/btp_pdf_rag/.venv/lib/python3.13/site-packages/ollama/_client.py", line 677, in inner raise ResponseError(err) ollama._types.ResponseError: an error was encountered while running the model: GGML_ASSERT(view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src)) failed (status code: -1) ``` Prompt: ``` You are DAIFY, a highly specialized construction AI assistant. Carefully analyze the user input and, step by step, generate a concise summary and extract key vocabulary or glossary terms relevant to construction projects, clearly explaining your reasoning for each identified term. Create a precise summary (maximum 5000 characters) of the provided text in french. For each identified term, your goal is to identify other FRENCH terms, concepts, or specifiers that could contextually substitute, detail, or elaborate on the original term *ONLY within the construction domain* from the provided text. Think of it like breaking down a general construction term into its more specific components, materials, types, or related essential characteristics. 
**Respond ONLY in JSON format enclosed in backticks with the following format:** ```json { "$defs": { "_GlossaryEntry": { "description": "Represents a glossary entry with a term and its list of substituting or specifying terms.", "properties": { "term": { "description": "Vocabulary term", "title": "Term", "type": "string" }, "contextual_specifiers": { "description": "List of substituting, specifying, or elaborating construction-related terms or concepts", "items": { "type": "string" }, "title": "Contextual Specifiers", "type": "array" } }, "required": [ "term", "contextual_specifiers" ], "title": "_GlossaryEntry", "type": "object" } }, "description": "Response model for the Summarization Agent.", "properties": { "summary": { "description": "Generated summary of the document", "title": "Summary", "type": "string" }, "glossary": { "default": "Generated list of glossary entries", "items": { "$ref": "#/$defs/_GlossaryEntry" }, "title": "Glossary", "type": "array" } }, "required": [ "summary" ], "title": "_SummaryAgentResponse", "type": "object" } ``` ``` Langchain metadata: ```json {"checkpoint_ns":"summarize_agent:8b8ae09b-1ca5-d022-00c6-55b41292404a","langgraph_checkpoint_ns":"summarize_agent:8b8ae09b-1ca5-d022-00c6-55b41292404a|summarize_image:11444a66-cd7a-4545-696b-7010af02d6d8","langgraph_node":"summarize_image","langgraph_path":["__pregel_pull","summarize_image"],"langgraph_step":2,"langgraph_triggers":["branch:to:summarize_image"],"ls_model_name":"gemma3:4b-it-q8_0","ls_model_type":"chat","ls_provider":"ollama","ls_temperature":0,"revision_id":"4ab1bea-dirty","thread_id":"chunk_465","ls_run_depth":6} ```
Sign in to join this conversation.
1 Participants
Notifications
Due Date
No due date set.
Dependencies

No dependencies set.

Reference: github-starred/ollama#4947