From 7dfffdbad878fe77a9c9d179a279ca5bb3164799 Mon Sep 17 00:00:00 2001 From: "j.foucher" Date: Sun, 22 Feb 2026 19:34:36 +0100 Subject: [PATCH] Lip sync v2: text persistence across TTS chunks, audio pre-buffering, smoothing fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix text erasure between TTS audio chunks (bFullTextReceived guard): partial text now persists across all chunks of the same utterance instead of being erased after chunk 1's queue empties - Add audio pre-buffering (AudioPreBufferMs, default 250ms) to absorb TTS inter-chunk gaps and eliminate mid-sentence audio pauses - Lip sync pauses viseme queue consumption during pre-buffer to stay in sync - Inter-frame interpolation (lerp between consumed and next queued frame) for smoother mouth transitions instead of 32ms step-wise jumps - Reduce double-smoothing (blendshape smooth 0.8→0.4, release 0.5→0.65) - Adjust duration weights (vowels 2.0/1.7, plosives 0.8, silence 1.0) - UI range refinement (AmplitudeScale 0.5-1.0, SmoothingSpeed 35-65) - Silence padding capped at 512 samples (32ms) to prevent buffer accumulation - Audio playback restart on buffer underrun during speech - Optimized log levels (most debug→Verbose, kept key diagnostics at Log) Co-Authored-By: Claude Opus 4.6 --- .../PS_AI_Agent/Content/Demo_VoiceOnly.umap | Bin 59573 -> 59461 bytes .../Content/MetaHumans/Taro/BP_Taro.uasset | Bin 531528 -> 531485 bytes ...ElevenLabsConversationalAgentComponent.cpp | 89 +- .../Private/ElevenLabsLipSyncComponent.cpp | 1267 ++++++++++++++--- .../ElevenLabsConversationalAgentComponent.h | 24 + .../Public/ElevenLabsLipSyncComponent.h | 74 +- 6 files changed, 1270 insertions(+), 184 deletions(-) diff --git a/Unreal/PS_AI_Agent/Content/Demo_VoiceOnly.umap b/Unreal/PS_AI_Agent/Content/Demo_VoiceOnly.umap index 1fe726f8184d959937a914ac20c8fd550ef6207b..b00a0021bbf7a8852df9d1b0174c47b25c722b69 100644 GIT binary patch delta 959 
zcmdmbk@@Ha<_+5z1q@UV!#=eS#RiNeK@ca4*eF&4-QzFAJ1`k6eLgUZ$#g21X&)5Ev~4=vteZhPOowNJx&D4elLLcQn1W}4 zx$*HvlMe=~u%^rcnm765aq-D(TzMw*hIlY7gD3#GY-WfKli?h=dcjZ?rW*)uBapjz zE?9PdXc*H|2va{SjOpV%xSoq)DojlC;ataX6(-}wU@q9};W|v)mVmkM!^4=2mq8Rp zgfa21fGheLp~IB863!La#4))rQibW#D!9FB0nV)iax*uA6&{QUVcN9?%ruMzGIxR%&5sRZ z+PDiWDi{~S#JwA;m2L8$;~bNF<2;y{4uOT<$AvMmyZ|vb$Hy-gkS}*G2y!$rEb|X_ zgn{z1$V~Ge4zNG^wH8L<5?Csd?VD__iGjCHd@8k*9tdj-$%r@_uWW@;O$ZqzZ zxRePl<2?l;vw7LnpDaR+C!Q+2d&A>5r{v_HC&9u|GK|2`5}tgaig$8B6$^;o{Akus jCINAQz3)x?E8G&5?nxcJz<7T1>$wvYaJgjSJ{2th3$IuH delta 899 zcmX?lfqCmi<_+5z1s3!KcKzU9;jmnBXHulLfz{+=j0LidFBnk3e;^R3VqnN;WMF9B z9L%KU%Q$gzoqs;pM4$i&Ea;!S&{bk`e4x|hzyJ{@!--&SzOUBgl>s_Tw-8*(Koz*A z=1EXZesCd?DPW=VfniKbr-GUGL19d4GvLPE4ANmTo(1I^O%4oJVLA-q&JQ!1d@xvr z^~x-udnPv?7oWVwm1i<X*8KJ zQibW_N-)HT`Rdd?UfrtKTRT>qFbrkfkViVntvFllWCGYw;b%w1pw^JBx96nBF~1>-`Ps&|7$ z>*IjFI|OFF2Qo`ufS8-(;};7|-j~HTSt3J>+0or^vSEg1eUPss6gat-!?^!r=m_R)K3R~&!D6guY&KcH z(wxOW&&YIgR;3B68Hlxcab*a{WV!XCj0~H3n~HfkLFx1^FcqhQy(Z9Swt08I6(feQ z?BoLzS8hHq@eihy+2-$)(RFM#oBESQKrYg=-RY;C^F}l8YZcdTdT+L$rNb;BliF{W c$5AeEUE$nn+so@7Zmym?K>_L{#eFJT00%WeApigX diff --git a/Unreal/PS_AI_Agent/Content/MetaHumans/Taro/BP_Taro.uasset b/Unreal/PS_AI_Agent/Content/MetaHumans/Taro/BP_Taro.uasset index 27f231c2644e7ecac1ccd862a93923fac51bb05b..432938fbc82222e23f57af7e7b7a06e193e4da12 100644 GIT binary patch delta 10523 zcmZXa30zFw8^`a=nf3@xjH04R+wj^Vp?a5em6|qNmXM`H3oY7A^d|45qN#@VeUVC( zXhEVyUTveOObA7irG+;Cb6@vP_v3#*hIzif=bU@abC&y@Yie(*CfroLDodfr@upGE z(CQ* zbbSZa(E&i2zkngG&g(mX0xxJ#IAZhpmJBgT-j0JPNIwAOUV_v^V(Td?FpszO5DP5! 
zCV?6VtWJ=?YDX5B=|cip5Ii8jm?H|JeMxGF6ANtdCxH%M+O5BNQR5tSMuzS(LpK!;*Oz!eSxGJ9;bshWKaOXmS~cPiU%DW zEhK>m2(A#I-U9`-H%O|hCmr0rMFL$Ae7{QqY%e;{tHeR57Ye)yqr(mM-v7Nr$FanR1*SKWfQBy% zJZZs!pDzmj5r~YHJQY7WSpNhEO6Bu;C;W6k>oZdM9zPav65?R3KMEFhiwr$^0GA=Q z7Mo~4*W=e7l87;~c>e=Q6#GSt{!9|Zaq$*j?ZIJObUuj!l`)YgllS8!6MXuC1D(L7 zJUkU$6oc9uBk2u6XNEvYVI)SlsOyuTF}>;KV{ zpN-#WI)0;s9i)0qLRcX80I4cP+&dnxYL62x%A7TlA?hDLLaKiOUShKZ`~t5hZX|Ib z1XwSLlG-Ooq8ct%;aR4VJH}?!VPRz?g(dq^>rjr4?*T*5?C9- z05b(7kPCtDGaU3qpuoA4q=uenfUy@i$h|N%?`Zfy9Djq0SZkdxFo1Wj$QuQ{{USq8 zo>+>N#29`d6)lTofCa-OZ)|w5&k1}Ke8+GR>t`H~0lxgeK?e^7Wi;`rrRW#M0Dh7< zz})*F=#a%hT=dj_QjAxAex^u7f$?mSp(jsl5ox&Rs3I<6Bp9$_SCIS14OyIMe1R5Y%vYG@8FEK&)S{#I4LV=p0$YAnR zVwu3<4;;Y8@f$nEX}r)_O>qCODasp(Wq~~?4$R`<{4*69Q*m*^b`W^&u^@?91h5gY z!o@mEl1L0LA0J#r8J4HpAJWe7LY&$UJ@v~cf ziJD;aFizJdp@8EiGM4f>k{E#IMFM8Y43Od{0x)ApW`aehNI)fp34Z?@2hJ%dcyR_d zuvU$vuz-Fj2^>mgfs_aw=%h{UxG8w|jd>yw1#e?ShMqhT#YjAgr(qGZ`TH3dy4(lL-{B z<3K431+$AuYC#qgh~LD)T7NjJvYFuBEgaNWGIYyP?h@{>{jNDokbfTs zGP!W&Al4lwF zLY(n~BoYggxQ8u80zbZ7`~TKm1tJjzgJL3MYTF?0p(jZai3v}L{?a7z*Vf&Y#l_qr z6wu^Ip4elY9AY^Bh>WQ~65-gUND_Z-YSBEBNCcAT zLd|Ml*kd#)5OL4D@%DwrEtz_lRzj0cUdGb z1_A9i5^yeK0CX7+uqD$_#sue9;6SMy1q<|X16!zBE1i}jUopSaBc8>~omncQK44K_HCxjQw-T=5+?af?Vq!8AKj zT~qUR0>4q+PFxJFnwlvBT&%Vyl_X{g(Ztw&A`t~!4v?xL7Cqr5{aowh`ar9vFrhS z7;{LT*kr>#glFq1l1NNdLX7;IBob4V5G_J*(XR#t(p-@-H69Xptg6r9Vn@YNUT7@? 
z{2M{?CVEYH{}YLe&h=B{PTcf-G%n`Wp%~aA)h;QmhA~6+-UZ*?~Rf~el4J6NBr{ck8l1TJA318C-pWq^v68&gqg32}# zK*64N+`z`Ot%U_HzaRk<0SjDti-WSmOx_&<6YxLaK;{Vwc77qb^Pj-?Am1dg*L>|K;xD-*N=N#Z3S%+Xp|AXJP5dLSq!z}hwx91zDewIYmN|VIt5Mog!#Dy{>QSKK}N|q$f_(l9gh&ml8 zXq!gzob`+6?dc?u*g{qDj%CQl_^Xw=cYa`vL<8I%zOZe!&DE2+;8Y z1=r@1RM(eG;HpRhT@bL8roeUftNrqniR(o}$Eh5B&5UGPcGrMnOe&|`ky2!3jBHeS zZfF;<+^~$DOueM>JgbE{Rz#Ukw=$AWI$>KpE8wMdBh|`mqtT2eV>)7+Gh%FPo%NiK zyw^+(RO1cUI5h_h+>0|3Gcc_9W~23aqT6Ve6?AMH%S_F&|ES0fl!lHP>DePCc}g9e z)_&dKS+26yHpE) zaoU#!G7vLkW2g!P94Y(9x;VkB64Tu3ZEnCD8e2pZyktMU9d&NixdQP(m7jjXB4h+M zJu8Rh=KIsl*sL%>6S`3pU`)hJliX}KYX?*sO;6=&Qnd_5UYciZ9JwQX6ruNywm3CS z4BQK3rgBXYld;Sc7=T!A;OwHJi6uL3KFXne;q><$aqrQHmutRm)S&R*cY4~i=#Z$K z1!MY4K2TaR14_3n?9SD z`$k80rCxZ`eNst@-#1iaVeV7$q^!NOjvgdC-Ti}4s@ogMhp%;&0=lhjkB)>Lw2ZR0 zZlqn9;B-|dZHo998-UQC8T9MN{y@AxCrkFGa*<;&s)CIp!DdEcu$!`9YI4%*l?O*P zDT&)g(mQQmc;#qoj8zB9EgiG|9@2iv7+E*1dl4VHn2b0Hkmo^y=9$+ouroJ~a1;?X zbn*#lk26N}mN^8OUG;0R8IX<)RGa}g!V38+-?UDj%X`HX)vetVee$J+LbFRTnwDiH zMmovd=591mv;WQ3=-N7+^KylQc{!^t#rGF>bnaFt{?h1HCCzoSE6=s6aPD|_=xH|N zQ`*&e(t{IfW*!Eff@-I;ea}vwzQvyLO>3}zg}1X_!W;ufHK*Oa;hfQ-y#4Ksk2&&O zuOqKlCmps-Im7mK(Ka4Sy*2&RDmh>H$CHk9et2=RC~@y_$|M}tWC_^OsQKfH;YljO zWCv@=!~-eK@V&K)^wIC*O^>)(qU z?o?RYt@<-M*2GvWI4m`|h^^(2evSqjAV@Imb7DVQ@X^4+mj}eWzrgF5s?QgQYhoRY?$6~rQvP#gLISS zYr4J1)VjXgY%JNoDLpm6-e7i1?ZMIulAY4k=y=26_wt{@Ps?&jXgU@J39o=3lZ~T39njOYl~Inxf@Cr7=#J^SYFb z?iAg}@9r%8EA863*2X$cr5g7Q+iZTG=gL3^b2YNrFJ|9GtJuHZtaxX$OkCB`WsJII zO^3|z^)|&dj1Me{^|?1A=@rE>?SE`KzTnS@KXmGg->s`SXdZGZC?Y#QuFCgKRk_>k z`R*gvAM0eE%fGd~=4@umTNC+Qr)DR+sNu*j4oAdmg!3cb*gCiP+b0`v3LV?IH13XV z|L6#6)8hSR{_Ar|A*%b~W3Ek4SWT9xr>?$D?g}fX0=TH##){y?)pDf3#T%Q#F4sPc zXu^#m)WO%PutM8Tle#$Uodr#^vX7l@KcI;4d`FvI?d`%N1C^`XAAZqVrDE!VZRrAv za`j+_Rnb>=?yA|G<%hR6+TCXq*Z)AG zegCML7F^n4bXw1Wv3Y*(s*ipu)3zBS`z|bzKj>D|)z{gkcIDgEN0~ZOS(?XWHlF(K zouB=s#qOh9ed6^NhpqR8s%MnEuS@f18MyNvogAvX@qK^&u}8iw9Piowh0M{|#OL0f zszbUBYi_N$oS-Y_klvEMc#-6b={im->vJ{OfoZFJ@=kO!w`e{sqgaQGHa%flJ$-xD 
zHpu$>p_3nNkNPBo`&JlODefpYW9hchE@+HenXYX$vpDBon1w zFW7Ry%X>wKQmyWb0PWRwyveXRI>yl~f@)5Vkp z%f6nf>p?bi&fY-kU9;2j89tlR1p9-=e<0euIRhJd)aTT#p6zDpbxv8!JHqZrlJBLR zS?)_d_n|dzci*OE)wBjq3wyVrdN4=Jk*X_Z!8ScPG?WhaDSd?RJKDsl2so^*N%e(~ zWa;3^&pDUpz&(9dWDFPa=XfQUM|^C_rWh5Ne|lcQ51AHSwq(NMyZ18J7dsDuM)z+? zEsvsJo_fa>w0dr5J_&wuCBw8j`%1jS>&lw9&gjbG{l)ILHJ_X*%F|g<_j2JeV zJMKHF8@A!N@t@v`e-sE6IO`VQ*l#dIop@=Z(Pf{vAg*=9TPi{`H^R5Iq~eZR=!x1N z2uGnoX|rRHTt(MBo4w)2Td1!#8&s#reppy4kQ*=EVAx;q1Fr_}|HOu|k^$9We z;cE{qy7afySnq6&Kk&Ctr_azn$D84<_jxNzAKktE>EdRCrwv28@kt)Goc2h%K!euP zvnYjKs@*t$PEdN>m`4#_qs>}poi#m?Ezn%Ep5mfViFyF7Lt^2R0o;zn3 z+=_X*U{}@0{NTE{Q&I2eB%PwZm~KhstbVSmVA|y&utl&W3HwZCR9G@*q9M$*c|9h0h-ftp zt!(!QaHN`=+G$b>M$8vq{r8@nFH9>E4pK@tjW+vH%74>3yOpb!7VF@yfhfIwR!xPGQW!Z#?BQrH<(fmWrva*>%9{Jx?y{ zi$ZMIiPrGT9Yb1Y*Yv1l99k{ZO{Fv`e4T-!=2dzh*0yneAX1(c6?ymTman|#qRCy` zeRwixQTdmc{1xu%xsh768p4E45g7)_$(eVaXW7_i@cpS2EslgK5-G~g>Fnw>)ffLCRR#pS9b}%JG)=a+3vOB-T#-Ny?9xi7Zqf0fu+p{oG z-_c@U<(22#UUi%r-!|!)9LT7e{_6UW%U&bD*uTY&>b-isl`VgAK%s7%!IC)|6`toK zI(yoAoVgFX@;mD2UNv7Xvq!@01dNH&-F{y;-<4hN`y}sg6JxWhzE)R#Z76DEMWg2u zvSc=3X{Q$4`pO#ev=eG1Vvz%jHx>qHdBJ3H1I&esnyKf)i=OxQ8O>_4a{bw2Gx)Hf zSQ!d+7ytDGInf9l;SL#TSn4ErI>~_#`fMb#4sNB-3+j68;4h>i!Ztw=Q&rBAxlw-wq=*lb^h(^r%-Y+#323+J^LcBDV-_?!eL?Hw4OVFL5P4E#LIR8zAFPr7p z(wi)O6DubLI^M zCS1S4hDO;?jaHvaYKDnuW+Zlk5&!NWJRcU%4bYwBz?e^ie#(1?Gn;z2>!mQ^dh}8HPEXv9B`0Yx zIZ@s|L{C2QdNK+b)w;cBBpu87uqfKbmAL7cnKa*ib}WyxMur-H!(z4U7fP~BtNfX; z4KG^~oprWeI#~WNKG?FjWodLlaDHU-Z3{Ic{YmBMhHpU@aqL!&lAF?tc4d4zUWfDy zrJtwk_?a!DtF7x%Xc$l)s>tjx{UjB8$jJ6uU|^5(;6;Vvd*-it#)7`b)^f@ZPiFV@ z`>5Ayui{s8*w+3}W@X%(uYF!NHQ`N?l^K>PZNsw9cQE^$jDpQhmay4Bb|*|3c0hwe z4V-Jd7N)2$0D-izdl}jHqSvg?@vj^(l1se=bJcZ~y^eeramsAYoiP14?9Av14-xt& zSH3@zgdHNXGt!eKG=vI*?f*Kkq!gd0%ce>EYfTw-(5=rBDlA4kCF~VvU2Xbky(5P| ztJ8<$#_>Ep8t=Px&GF;9odfIdhq3aHJ?>!&Y@?7`-lGd!1#R|XFK1KdpVpP0BNIQ* zL$|JDmzmRo6WciAY7(l-I{R84tY}wz)!e*s$E4jxGaE{&0OyonYYs4m-iB3vU;}^oGktIPgK*BFQ?YZ*$mo>~!GV!V1l&!sx|xvvZ#76&B52U$db0#G(It^zpw94Y}L`q+FHZ< 
z`mC&srau)%9dR{Wh)zv8@Uo-4zr}lRPXzHr%PW zYn~YTWMUoUdG58nK6Cy7~fgdXTfD(Br zTJ;Wb59RBJS?fx=_S4F#V%V!{MSl7rng&I?r1lW)n5@)4_-h?){%Q}JtAY{qz%J~E z9KZkf4MCJjYCLFFRDQAte5W!r!aDw)hPVwNM z2%{NLl>hsOYDndeSJGH~wQ$-I%7K#e;j}*h^?(cpHTj?Rh|z9D(p+RqWRhs>DHJ=t zNiwZ$#!}d*-NqmA`_H!HyCl*QXuIvirAwG8v~@FN#UH^Rz(BhII`a1v($-K%B>$_i zq^gj%PmZeh-%T}nFW?XP&KA;UQsJ)(;0;&uV=HNUD0Kc#Uxr@s;tE>8zxVzjtZysCW=GD_~{JIvl|5tW{Z#7PvTN3}L#8#TR&b8S0 i+ZJ^H758?7GJRohbsa45EBjf14-0TguA&kbTmA=~fwCq5 delta 10664 zcmZXa30zFy`^WFhnf6GU7%ha+I>=sHzkbqHDhZ`zE81uwN{h)>Uz3U^4eeT{k`QG| zLZKw2MN!I0X7oqya~z@(p;AvDBy3W@yfBNvJs@$AZo}P=qk4Zo)gaulg za1a`Tg6L+Ev5-40ln%_E;6U}dIyX2}4?KNFD*rx|1$;VjU=)S|W|zn?Py%oZV*9Xx z_Gd2+yeEkmBa79aNTS4FBJUeXl*Gj{yjsseT=Y4Bg6Sh7PbT-*0Ve4Cg#*0^eQqe5 z0dCVI@Qr>@pW6dLu@nyUB2bVuUZff*!PDY*A1kquz$0!6SQ8iJ{xwh{8kLa6`hy@BbcuT5^41)NJ943@ zD!2g+M23OVIOvY`cP3s`&zMwn@=+GBT|oj-5NuqHgZ85+P+Uu@ix}QBypPq6xL9}0 zKxq=*(0II|bQe;+mQgHFxSdp$3htePSGCOx7v=snm!slKJhqcm|8%^>Iw|-Ay8=k! z3}|DcBzm?km?X}`#Y()|Ibk9Z1-%itf$bDW4hvADNuUOT38zS4Ni+-0ODBO`2+ZNu=WKmZtdaxo0>>>3G#LU8E@4$5Otpm7H`urP6nWdP4w5~zcq;SmWK z#W8?!GY&%IP>}NsH+o=mswWsA@wrG9Z_Z$RE;L`^V!=sq4ASsLNqbAG%I72lWPT8N zqd@74$S_bMwk{F*(S4+%d6(KHY24@}ZP2@1Ou0hn7LDS-IvIwf zmBeMY&WIJYP@p#I`6_4RM%RY`{fq z6jEs{@MELMo5^LTF~Q;OVvj$=>PgcEvwU!Bl#YUP{vt!4+n&w@@iqu@m*VZe?C<-mL~7Y7A=eeOsO z1Js`*JhJt-|tpXM}_Jjm@ z5b&RofYf;wxZQyRY|7Z@nc(d!9Mqjhfz&&ZVW31DCL8hDw*DXzVVo2)K<-BzL|+iE z;&R-(=aWc8!Q5VvAznqI-aX$)^(J40tMDBM<(I@~CsD7`AW6j9!n#RR&J@O<^qBn%+Tm_Oy91n!JM-dl&uKu@o0wt0thfgF?azt4qqQHzH zGQ<-}OvhmrlKA&TzEC5He|LT6G+fjxMS;h3lIP!%S1^+#61NMY15%nK5zCRV(!@C2 zv~khp+J8eSS63v$S!Urz-FbKxT!W|JToUjpXMjEPaDZ){`f?@^EWm;4brh^xiW}J0 z&ArY7NoFK4xq<~$mXkmf1Q$^pv{#_uytT+MP$CxcI=-1oY;m!!65g0^FhGDksV>W# zED+#;1G!t`So6bA)Y|nT5d{X$qPoOWGKx3UzY!NlZi^$P85h58CY2;2hUntmZ6Xl` z2e*@|!8(Dh3E|1{6nToXTN%8oD||>LC*viF-r4z*#3{H)lw1`c5>c=&kkkgTyNC`b z29ZP}WC`)%ev(LpEFm5X!^QHu@OHo^c@iN@cv23LL?UDfvEm3xBtn)D_ebGkPmOqe z2yqcdB%4AH6h5=X@wn(y_uqQUPvK%gEee88iwpxLVx5UK 
zSddJrMjl@SU2LM@rJsh2*d~}<#{$mTB5xEp=86n)j1nVTdkz;1YQ=pgB5y(=NhDT` zSok*=N!1cNhv@F-%ea{Q5CtDgNYxOFqK(fFzZ4g-*(`X(0FTOX;8OqJ3FF)li6~f6 zi5t-k;(-zKZgYnu{vC>IYedzEbA4h;Cfp~9L?{xYe(*q4Eef90lRWzEKHxVp!KYRnSU17<7j3xlJ0EFcfhR9Xz_J;>?s$iTa&IR0W-}91eIkLm zPnaO84+o)7P+&MD#XFb;UkhTdtQ1<9fI^cNKa>Quu)tLb5_k{6O9B|RqTqxiu43iu zTbUr003%Pu2NoTB(*Um{Qlz?apR&MdX&j6^MFCqzWQZT<-~TCz?2sji;~~Uml@M*@ zNTR}D;zD_nIN>i*bsR40wWEMKp5!_4FVAkWNF1RW_`n`3;3ctj;XP-8uM-IH90iPt zqw&kHZ$YqQBX_KuYSaVIdU_`^o&+VhV+L_4zhNnDm$;KS2jLE z7M7Zx$8cop)~SNM?jOHgUVdd#ldaatjwn z$Of{5M|Fxn3@0J+935@SG&u5X_BC%^#NO31(s**f#p>3Hb%QSU?5Z-?yY-j)g%>tD zZqD&eN|<4=t<-PE^NXh$m)31s(Dc*$B{wAgp{wJoD|c0E`K~{*<^RoLXvN+MNWbMs zyOpnTT(PZUj-T4JlJE3@_AU*husb2|b8Y<>Y&1J&oORK)c&79xwSd?RrJo=9r!E|w zG<;^AYIxV0wY!8d?oJ77*03ML;1vL4JezM9NL>u>IS(keMU=RP%9t zH5JoU4e7(~=|SHzXx|k|v`&ZT7q13~eP1?yn%dtCV<|(D&h9$2$F(a(g$d2>|mhdkS=|MOg%kxk!2HA+vOIAqzeI#i)>&jF3E z|Lb1A>|h_9x+_yX51f_egK3lf1ao(D_9m)@UH7k2p zRQB3tMyS|6l9;-6F#AAp8^VS|`R7GtUiY`r-l_<7IA%B}c9{`VinZ^3)G@yO+4lEZ z2b~Vjg9Bg8LDD!Lr%gV5mpYpv)odq>F?ukyKiOK1gKTHhds!p)Z_~YzK=!`WRZ3%5_>)F=GG>Mz^>Nq-3+sr(GXZKI*rN z^*!5OC&(>AVy~Zn+isV;$7jrGh#ebP>G^E@dM|b58|iL4l6NWH-!$~`0RP;D=-lYI z`I-r@F0Vc*V1Mo9So_487@Fx!OWm}>iH97}s?e@2jMC=23Or}Ho}5YhR9>3@u=3GL zMtAI!RhO?&532hO?&R?xB(OY4y*` z(yxwoML+%iQ75OSLbax1TcGr)SMlp5ecDgM^C-nbV*^@=;rZszYBOZC`9BVKVv|L^ z*|A7@WvTux{tu5fc!HVLnKti-zox(~Nm-NIp{#6`9=MtcLz<$N%9h=3YFDu;gPM}+ zRs5#MoITN=qZf2LO+DGE^t;uz#+hdr-^xc%_WsPzZojO##@=yzx}4XW1+HkeuY?=( zfXDEGy>tKlcbwmZ>bJD9o1@@Wsg)_UzGyq_&MI6Adn`?JsI;by!6? 
z^v_G#!rc!XWEY?*e2Z0g7zfXmA6jqNE;o3kb*drb6HCe@CTRXL-qrYabdOp?fw-<(x)%4R!3=|+di<}QW&VAZa<46FI_nKCB+b9})?@07)b#lVsV=r$tLEG*Jg(}N z#in0Bu@%?)u$#hMGfazi?4`@lTN)-hUOb(Z5~}!c{~6_kuKSNT4)0@XY8q2*;?Cx- z-;v4j8O08W&&Yg^l{0oX;O9B?$|n6jA7(u5OxWpXmB>M&-9j7=%Ib$SeVh!>2WIDu z;^fwrtn3V_OV4*-Z-hR(;W2|)6jkWlU!1~D3g2+4h`Z*-p%#t1IsT^h@ltziyfwXp z{cdx6uJCTpO7RfJ9~@R-zuk1A==N6MtB*Be8P&>`-Ve)5i@rWNvRYDS;wxFDNYh`) zrgvrP9(NN};Y|uVrFV>z>Gs18K=BH#EP|^xG>=uPqIFG=G0wXX)Y> ztTVoBr?ldZx9O~vrulx!Dxj7y}YJy(QuPq!k1Z{CSEen^$p<>HDG3iEMAhMs@j;HlOf%#Y-pl0(%N!z ztUK=m)s@}TvtnQ%FE2v|yTR+i70by>?^Qvptemwe-$t8k`87hXW+cOrKjhn4cRTWa z2k$_BSovwe*EtIgelYlH*Zt1Q|JLw&eo?_n&BDQg4f{^T*l#eXsXZL`pyh;@cdd6< zn0>)GH`lV#Wnro9Zqq)+E*Kj4>7&B2iu_W2;w^jRiSg{K?xq9O@K+96FE$rWPi_$g zNn0kYtEupD>3lKmN6_)OHA5Ey+X|L5!oLi?j%U1)y=A9+@`Lw;M^$=-=D(1-lr14Y z;8wgA=fC#M@|86^>id;z+Uj(S!n$45>yny|Uy3t|Gd*uJS@BVD{0`oR(~~=uWOLi6 zjPb8Hd|`8v*DOmq@^YZlW?6dRx{Re}QY)7XKA*Qd=eCE&KH;L~u;t?yChw}PNft)= zDolIdSk|ksIY!AcZA>Hboy(DJOYvD>*^^e=vY2x=!nLzH$K3{Vq1J#IMh-pcK zYM)bOmGRFS&PrkaDdypE_N|7c4qZK~a%-9&(-Mjk*u^k4ip|9M4NRzX`u6fyQ*7gURdyLJ9ivlb&6-U|T1NQMTdxI5wM4=@u2)g=8}_*PU)WZ|cG%c?dMGSp z>XWABZ#0o?soCepUCX}R{qyJ!LFb~g=dO-gZ5zM8rhCIfVT@-)m_^u~iSuMiJ=CK@ zdZVHXKY#YsTzN8w8?fB|g{ou!ltu50*H!y6OLng_q>VH-xpmv-X*&~#h9x|Y0hUi> z+FjRmMx5b5H8Oc|REiRt@;ZVkN(ncZ4$RNCFLnQJ{a0G1wi@K+2?d3kg98~-f{uk( zv*nJ|7wS)}k;<`FiJRwdcrRmoqSEP0K1S;euD)bX9%?y+xXBxF9z7|G)DKNMBC*Th z^&3Zf#nF!4>lKc*YNfPpkRE*$otwTl_L$CCm(|aAYs4lVQT?_|CxiG0@K6H#cX_-Tn+|m?9^g zUW=syv4s?6 zdT|1lf&I?66IM$O9J#UmMzxgA-L+4~-fVi)+;VPN@053Ep`qq>u|MD3z3T4fnSw>n zoDD)3nPw%u=Jh%{WEv(u*ZHwA&?H8O5_Mj)W5p_YNhI$6Ie1Gfcx^D)vAi=s?%pqC zpocx%^usw9q}bjOZkNaHm{{LIY($XpDkRhq86gkaK(I=@=8NBcj5U$pBb+gnQ(`jm&>Y3RRk`T z`@E~L$4R+7OJa+Y`?iz_k7M*H+dd0#=4_qR=5MsV<@%4L=^w^zGwWQ~a%=9NENnh! 
z3o1Eg*KG~6HFWH=X@n}wXwemqEKHNx@Bv5iM^sy!{)mHThEycWKeX|OTW3;w@Y-Ue zo)y{vi-glD5~5sf7tAOejIVsQm8WlkvDmN*=4w;72s0#Wh55yOOZv>5?O4McWY<#D z+-}Q13CpN9rQac}vUlY>g%HFMJ`E{rzPwE>E)LiGEMV(4owx)NYpY+SnJ=V-RQYeS zFZDNkce!MLqECIo8{MN9&KxNWn;n>VsxLr=()zXk0{f0iVB3KrlZspR^FPl$oDxr| z?^&s#y?!ilmG+ALkMz+XP!I+6uH5ppNw>_4 zTN#q}wte*_Q#TA|P)xt(d+X=yH<6eN(_1(|1%JVZL1m@Ut~FS;3m?X`Gh}9t z6o1?ab6j%?Sk+Kd{^jr&)e$qW(5ht%j|L9zajK)P5O@LY%8B{N=)9zNO4y^v(!&>H zWSl&OIDCLytZ2-YVLZlUipqHaoZmPCnu! zy-&JXd*dAcfXA&myY|k`KKg14yppL_9mHSlM>^tUi3AhLd7qV0qIpI*fOt! zqCa2yAAQqiuJ6HLNZYn0O23e}1E)@;2d?RGwF%ub>+bC{vrc^3Ve)WBc*81*9JmEy z?J8_?%{uvyu~Qj_IDCABTOAJPqR_@-&CTbFlo2|d1?!4%&(0eq%5=({fdT#mr+6uP z8+>Zakc5d4yenHBs8K{TuTv)n?~ z7O$q6;g(eUo1F`hQR3b%%Dlp``B%hJ zu$Ww}gLeE{AzKA;?O_u+fQ-DfT9R*d|5>=M+EkO6?iJPj`JNfl*p}CYkGYxH<36&N zZf~v#?`Or`rovY?8PwEion}_MmYP)zt)?ZwAuocd#W^g*GpN|Y=V686%Z2Q`c9>Y~ zf`up62yK=oPwOyq-iy$mnf9u7+4668{Pd(Qh4%zCxa&$L!pHi@6nMjs;$4=eI7?iG ze{I(tr6@yzA1eF+fuanp+C|cB>TTU>WkH8K?K)Kk`%?~8_|fI1dC;^3wH`D-dEW9s z+Ro|6@Yg`PQ2hsfunY4v$9}*|MQ}HecALtRe+plKjReuGcx^$nDLnl`Xr92#v3lA# zo?{rzL=YEDdq=1A@b<;fOsSewUQi6pglbLYov5N|@HAZE+qql!X!1OrSeiQ}KoA>C zTL$2F!%14QJa2CXEmg2Hjb=jO6{ORy&B%q_+T;xX>kRwaWbx@wXOKzGlGry^8a^)#ye3-^Q3CPZ*u>B zV`<);D%uuG09LIjbv9K{Uq#z0Nf{N)tEH{Vk-^@B$yx+&cjE=HB;0v#d?ogh*mMs` zcit^OiFgHd=(l$3ZjbGOvJTsK?%@40lu+d*YfA?4Mt4eBMQH>`2s8pDl^SO~cUF;W z*Rr=dS{ixAEn0By9xV7byULrlL1GF|5|v^LzWz_@DVQac=C!mN|5$kp{= static_cast(AudioPreBufferMs)) + { + bPreBuffering = false; + UE_LOG(LogElevenLabsAgent, Log, + TEXT("[Turn %d] Pre-buffer timeout (%dms). Starting playback."), + LastClosedTurnIndex, AudioPreBufferMs); + if (AudioPlaybackComponent && !AudioPlaybackComponent->IsPlaying()) + { + AudioPlaybackComponent->Play(); + } + } + } + // Silence detection. // ISSUE-8: broadcast OnAgentStoppedSpeaking OUTSIDE AudioQueueLock. 
// OnProceduralUnderflow (audio thread) also acquires AudioQueueLock — if we broadcast @@ -540,13 +559,32 @@ void UElevenLabsConversationalAgentComponent::OnProceduralUnderflow( USoundWaveProcedural* InProceduralWave, const int32 SamplesRequired) { FScopeLock Lock(&AudioQueueLock); - if (AudioQueue.Num() == 0) return; - const int32 BytesRequired = SamplesRequired * sizeof(int16); - const int32 BytesToPush = FMath::Min(AudioQueue.Num(), BytesRequired); + if (AudioQueue.Num() > 0) + { + const int32 BytesRequired = SamplesRequired * sizeof(int16); + const int32 BytesToPush = FMath::Min(AudioQueue.Num(), BytesRequired); - InProceduralWave->QueueAudio(AudioQueue.GetData(), BytesToPush); - AudioQueue.RemoveAt(0, BytesToPush, EAllowShrinking::No); + InProceduralWave->QueueAudio(AudioQueue.GetData(), BytesToPush); + AudioQueue.RemoveAt(0, BytesToPush, EAllowShrinking::No); + } + else if (bAgentSpeaking) + { + // Queue is empty but agent is still speaking (TTS inter-batch gap). + // Feed a SMALL amount of silence to keep the audio component alive. + // IMPORTANT: we cap at 512 samples (32ms at 16kHz) regardless of + // SamplesRequired to avoid queuing large blocks of silence in the + // audio component's internal buffer. Without this cap, multiple + // underflow calls during a TTS gap accumulate hundreds of ms of silence + // that must be played through BEFORE real audio data — causing the + // audible 1s+ pause between TTS chunks. With 32ms chunks, at most + // one small silence block sits ahead of new audio when it arrives. 
+ constexpr int32 MaxSilenceSamples = 512; // 32ms at 16kHz + const int32 SilenceSamples = FMath::Min(SamplesRequired, MaxSilenceSamples); + const int32 SilenceBytes = SilenceSamples * sizeof(int16); + SilenceBuffer.SetNumZeroed(SilenceBytes); + InProceduralWave->QueueAudio(SilenceBuffer.GetData(), SilenceBytes); + } } void UElevenLabsConversationalAgentComponent::EnqueueAgentAudio(const TArray& PCMData) @@ -573,10 +611,50 @@ void UElevenLabsConversationalAgentComponent::EnqueueAgentAudio(const TArray 0) + { + // Pre-buffer: accumulate audio before starting playback. + // This absorbs TTS inter-chunk gaps so chunk 2 arrives before + // chunk 1 finishes playing, eliminating mid-sentence pauses. + bPreBuffering = true; + PreBufferStartTime = FPlatformTime::Seconds(); + UE_LOG(LogElevenLabsAgent, Log, + TEXT("[Turn %d] Pre-buffering %dms before starting playback."), + LastClosedTurnIndex, AudioPreBufferMs); + } + else if (AudioPlaybackComponent && !AudioPlaybackComponent->IsPlaying()) + { + AudioPlaybackComponent->Play(); + } + } + else if (bPreBuffering) + { + // Second (or later) audio chunk arrived during pre-buffer period. + // We now have both chunks buffered — start playback immediately. + bPreBuffering = false; + const double BufferedMs = (FPlatformTime::Seconds() - PreBufferStartTime) * 1000.0; + UE_LOG(LogElevenLabsAgent, Log, + TEXT("[Turn %d] Pre-buffer: second chunk arrived (%.0fms buffered). Starting playback."), + LastClosedTurnIndex, BufferedMs); if (AudioPlaybackComponent && !AudioPlaybackComponent->IsPlaying()) { AudioPlaybackComponent->Play(); } + SilentTickCount = 0; + } + else + { + // Already speaking — but the audio component may have stopped due to + // buffer underrun (TTS inter-batch gap). Restart it if needed. + if (AudioPlaybackComponent && !AudioPlaybackComponent->IsPlaying()) + { + UE_LOG(LogElevenLabsAgent, Warning, + TEXT("[Turn %d] Audio component stopped during speech (buffer underrun). 
Restarting playback."), + LastClosedTurnIndex); + AudioPlaybackComponent->Play(); + } + // Reset silence counter — new audio arrived, we're not in a gap anymore + SilentTickCount = 0; } } @@ -592,6 +670,7 @@ void UElevenLabsConversationalAgentComponent::StopAgentAudio() // while holding it would block the audio thread for the full Blueprint handler duration. bool bWasSpeaking = false; double Now = 0.0; + bPreBuffering = false; // Clear pre-buffer state on stop. { FScopeLock Lock(&AudioQueueLock); AudioQueue.Empty(); diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp index bcb4adf..3a0e0b1 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp @@ -33,136 +33,136 @@ TMap> UElevenLabsLipSyncComponent::CreateVisemeToBlend // PP — bilabial (P, B, M): lips pressed together { TMap BS; - BS.Add(FName("mouthClose"), 0.7f); - BS.Add(FName("mouthPressLeft"), 0.3f); - BS.Add(FName("mouthPressRight"), 0.3f); + BS.Add(FName("mouthClose"), 0.9f); + BS.Add(FName("mouthPressLeft"), 0.5f); + BS.Add(FName("mouthPressRight"), 0.5f); Map.Add(FName("PP"), BS); } // FF — labiodental (F, V): lower lip tucked under upper teeth { TMap BS; - BS.Add(FName("mouthShrugLower"), 0.5f); - BS.Add(FName("mouthUpperUpLeft"), 0.3f); - BS.Add(FName("mouthUpperUpRight"), 0.3f); - BS.Add(FName("jawOpen"), 0.1f); + BS.Add(FName("mouthShrugLower"), 0.7f); + BS.Add(FName("mouthUpperUpLeft"), 0.4f); + BS.Add(FName("mouthUpperUpRight"), 0.4f); + BS.Add(FName("jawOpen"), 0.15f); Map.Add(FName("FF"), BS); } // TH — dental (TH): tongue between teeth { TMap BS; - BS.Add(FName("tongueOut"), 0.4f); - 
BS.Add(FName("jawOpen"), 0.15f); + BS.Add(FName("tongueOut"), 0.5f); + BS.Add(FName("jawOpen"), 0.2f); Map.Add(FName("TH"), BS); } // DD — alveolar (D, T, N): tongue on alveolar ridge { TMap BS; - BS.Add(FName("jawOpen"), 0.25f); - BS.Add(FName("mouthClose"), 0.2f); - BS.Add(FName("mouthLowerDownLeft"), 0.15f); - BS.Add(FName("mouthLowerDownRight"), 0.15f); + BS.Add(FName("jawOpen"), 0.35f); + BS.Add(FName("mouthClose"), 0.3f); + BS.Add(FName("mouthLowerDownLeft"), 0.25f); + BS.Add(FName("mouthLowerDownRight"), 0.25f); Map.Add(FName("DD"), BS); } // kk — velar (K, G): back of tongue raised { TMap BS; - BS.Add(FName("jawOpen"), 0.25f); - BS.Add(FName("mouthStretchLeft"), 0.15f); - BS.Add(FName("mouthStretchRight"), 0.15f); + BS.Add(FName("jawOpen"), 0.35f); + BS.Add(FName("mouthStretchLeft"), 0.25f); + BS.Add(FName("mouthStretchRight"), 0.25f); Map.Add(FName("kk"), BS); } // CH — postalveolar (CH, SH, J): tongue bunched behind alveolar ridge { TMap BS; - BS.Add(FName("mouthFunnel"), 0.45f); - BS.Add(FName("jawOpen"), 0.2f); - BS.Add(FName("mouthPucker"), 0.15f); + BS.Add(FName("mouthFunnel"), 0.65f); + BS.Add(FName("jawOpen"), 0.3f); + BS.Add(FName("mouthPucker"), 0.3f); Map.Add(FName("CH"), BS); } // SS — alveolar fricative (S, Z): air through narrow channel { TMap BS; - BS.Add(FName("mouthStretchLeft"), 0.4f); - BS.Add(FName("mouthStretchRight"), 0.4f); - BS.Add(FName("jawOpen"), 0.1f); - BS.Add(FName("mouthSmileLeft"), 0.15f); - BS.Add(FName("mouthSmileRight"), 0.15f); + BS.Add(FName("mouthStretchLeft"), 0.6f); + BS.Add(FName("mouthStretchRight"), 0.6f); + BS.Add(FName("jawOpen"), 0.15f); + BS.Add(FName("mouthSmileLeft"), 0.3f); + BS.Add(FName("mouthSmileRight"), 0.3f); Map.Add(FName("SS"), BS); } // nn — nasal (N, M, NG): soft palate lowered { TMap BS; - BS.Add(FName("jawOpen"), 0.15f); - BS.Add(FName("mouthClose"), 0.2f); - BS.Add(FName("mouthPressLeft"), 0.1f); - BS.Add(FName("mouthPressRight"), 0.1f); + BS.Add(FName("jawOpen"), 0.2f); + 
BS.Add(FName("mouthClose"), 0.35f); + BS.Add(FName("mouthPressLeft"), 0.2f); + BS.Add(FName("mouthPressRight"), 0.2f); Map.Add(FName("nn"), BS); } // RR — retroflex/rhotic (R, L): tongue curled or lateral { TMap BS; - BS.Add(FName("mouthFunnel"), 0.3f); - BS.Add(FName("jawOpen"), 0.2f); - BS.Add(FName("mouthRollLower"), 0.15f); + BS.Add(FName("mouthFunnel"), 0.5f); + BS.Add(FName("jawOpen"), 0.3f); + BS.Add(FName("mouthRollLower"), 0.3f); Map.Add(FName("RR"), BS); } // aa — open vowel (A as in "father"): wide open jaw { TMap BS; - BS.Add(FName("jawOpen"), 0.7f); - BS.Add(FName("mouthLowerDownLeft"), 0.4f); - BS.Add(FName("mouthLowerDownRight"), 0.4f); - BS.Add(FName("mouthShrugUpper"), 0.1f); + BS.Add(FName("jawOpen"), 0.85f); + BS.Add(FName("mouthLowerDownLeft"), 0.5f); + BS.Add(FName("mouthLowerDownRight"), 0.5f); + BS.Add(FName("mouthShrugUpper"), 0.15f); Map.Add(FName("aa"), BS); } // E — mid front vowel (E as in "bed"): mid-open, spread lips { TMap BS; - BS.Add(FName("jawOpen"), 0.4f); - BS.Add(FName("mouthSmileLeft"), 0.3f); - BS.Add(FName("mouthSmileRight"), 0.3f); - BS.Add(FName("mouthLowerDownLeft"), 0.2f); - BS.Add(FName("mouthLowerDownRight"), 0.2f); + BS.Add(FName("jawOpen"), 0.5f); + BS.Add(FName("mouthSmileLeft"), 0.5f); + BS.Add(FName("mouthSmileRight"), 0.5f); + BS.Add(FName("mouthLowerDownLeft"), 0.3f); + BS.Add(FName("mouthLowerDownRight"), 0.3f); Map.Add(FName("E"), BS); } // ih — close front vowel (I as in "sit"): narrow opening, spread lips { TMap BS; - BS.Add(FName("jawOpen"), 0.2f); - BS.Add(FName("mouthSmileLeft"), 0.25f); - BS.Add(FName("mouthSmileRight"), 0.25f); - BS.Add(FName("mouthStretchLeft"), 0.1f); - BS.Add(FName("mouthStretchRight"), 0.1f); + BS.Add(FName("jawOpen"), 0.25f); + BS.Add(FName("mouthSmileLeft"), 0.45f); + BS.Add(FName("mouthSmileRight"), 0.45f); + BS.Add(FName("mouthStretchLeft"), 0.2f); + BS.Add(FName("mouthStretchRight"), 0.2f); Map.Add(FName("ih"), BS); } // oh — mid back vowel (O as in "go"): rounded lips, open jaw 
{ TMap BS; - BS.Add(FName("jawOpen"), 0.5f); - BS.Add(FName("mouthFunnel"), 0.5f); - BS.Add(FName("mouthLowerDownLeft"), 0.2f); - BS.Add(FName("mouthLowerDownRight"), 0.2f); + BS.Add(FName("jawOpen"), 0.6f); + BS.Add(FName("mouthFunnel"), 0.7f); + BS.Add(FName("mouthLowerDownLeft"), 0.3f); + BS.Add(FName("mouthLowerDownRight"), 0.3f); Map.Add(FName("oh"), BS); } // ou — close back vowel (OO as in "boot"): tightly rounded lips { TMap BS; - BS.Add(FName("mouthPucker"), 0.6f); - BS.Add(FName("mouthFunnel"), 0.4f); - BS.Add(FName("jawOpen"), 0.15f); + BS.Add(FName("mouthPucker"), 0.8f); + BS.Add(FName("mouthFunnel"), 0.6f); + BS.Add(FName("jawOpen"), 0.2f); Map.Add(FName("ou"), BS); } @@ -220,7 +220,20 @@ void UElevenLabsLipSyncComponent::BeginPlay() AgentComponent = Agent; AudioDataHandle = Agent->OnAgentAudioData.AddUObject( this, &UElevenLabsLipSyncComponent::OnAudioChunkReceived); - UE_LOG(LogElevenLabsLipSync, Log, TEXT("Lip sync bound to agent component on %s."), *Owner->GetName()); + + // Bind to text response delegates for text-driven lip sync. + // Partial text (streaming) provides text BEFORE audio arrives. + // Full text provides the complete sentence (arrives just after audio). 
+ Agent->OnAgentPartialResponse.AddDynamic( + this, &UElevenLabsLipSyncComponent::OnPartialTextReceived); + Agent->OnAgentTextResponse.AddDynamic( + this, &UElevenLabsLipSyncComponent::OnTextResponseReceived); + + // Enable partial response streaming if not already enabled + Agent->bEnableAgentPartialResponse = true; + + UE_LOG(LogElevenLabsLipSync, Log, + TEXT("Lip sync bound to agent component on %s (audio + text)."), *Owner->GetName()); } else { @@ -368,10 +381,17 @@ void UElevenLabsLipSyncComponent::BeginPlay() void UElevenLabsLipSyncComponent::EndPlay(const EEndPlayReason::Type EndPlayReason) { // Unbind from agent component - if (AgentComponent.IsValid() && AudioDataHandle.IsValid()) + if (AgentComponent.IsValid()) { - AgentComponent->OnAgentAudioData.Remove(AudioDataHandle); - AudioDataHandle.Reset(); + if (AudioDataHandle.IsValid()) + { + AgentComponent->OnAgentAudioData.Remove(AudioDataHandle); + AudioDataHandle.Reset(); + } + AgentComponent->OnAgentPartialResponse.RemoveDynamic( + this, &UElevenLabsLipSyncComponent::OnPartialTextReceived); + AgentComponent->OnAgentTextResponse.RemoveDynamic( + this, &UElevenLabsLipSyncComponent::OnTextResponseReceived); } AgentComponent.Reset(); SpectrumAnalyzer.Reset(); @@ -388,31 +408,156 @@ void UElevenLabsLipSyncComponent::TickComponent(float DeltaTime, ELevelTick Tick { Super::TickComponent(DeltaTime, TickType, ThisTickFunction); - // Smooth viseme weights towards targets using exponential interpolation - const float Alpha = FMath::Clamp(DeltaTime * SmoothingSpeed, 0.0f, 1.0f); + // ── Consume queued viseme analysis frames at the FFT window rate ───────── + // Each 512-sample FFT window at 16kHz = 32ms of audio. + // We consume one queued frame every 32ms to match the original audio timing. + constexpr float WindowDuration = 512.0f / 16000.0f; // ~0.032s + + // Pre-buffer sync: don't consume viseme queue while the agent component is + // pre-buffering audio. This keeps lip sync in sync with audio playback. 
+ // Without this, the lip sync would start 250ms ahead of the audio. + if (AgentComponent.IsValid() && AgentComponent->IsPreBuffering()) + { + return; + } + + // Wait-for-text: hold playback until text arrives so all frames get proper + // text-driven visemes. Timeout after 500ms and start with spectral shapes. + if (bWaitingForText) + { + const double WaitElapsed = FPlatformTime::Seconds() - WaitingForTextStartTime; + if (WaitElapsed >= 0.5) + { + // Timeout — start playback with spectral shapes as fallback + bWaitingForText = false; + PlaybackTimer = 0.0f; + UE_LOG(LogElevenLabsLipSync, Warning, + TEXT("Text wait timeout (%.0fms). Starting lip sync with spectral shapes (Queue=%d)."), + WaitElapsed * 1000.0, VisemeQueue.Num()); + } + else + { + // Still waiting — keep timer frozen, skip consumption + PlaybackTimer = 0.0f; + } + } + + PlaybackTimer += DeltaTime; + + while (PlaybackTimer >= WindowDuration && VisemeQueue.Num() > 0) + { + LastConsumedVisemes = VisemeQueue[0]; + TargetVisemes = VisemeQueue[0]; + VisemeQueue.RemoveAt(0); + if (AmplitudeQueue.Num() > 0) AmplitudeQueue.RemoveAt(0); + PlaybackTimer -= WindowDuration; + } + + // ── Inter-frame interpolation ───────────────────────────────────────── + // Instead of holding the same TargetVisemes for 32ms then jumping to the + // next frame, blend smoothly between the last consumed frame and the next + // queued frame. This prevents the "frantic" look from step-wise changes + // and creates continuous, natural-looking mouth motion. 
+ if (VisemeQueue.Num() > 0 && LastConsumedVisemes.Num() > 0) + { + const float T = FMath::Clamp(PlaybackTimer / WindowDuration, 0.0f, 1.0f); + for (const FName& Name : VisemeNames) + { + const float From = LastConsumedVisemes.FindRef(Name); + const float To = VisemeQueue[0].FindRef(Name); + TargetVisemes.FindOrAdd(Name) = FMath::Lerp(From, To, T); + } + } + + // If queue runs dry, decay towards silence and reset text state + if (VisemeQueue.Num() == 0 && PlaybackTimer > WindowDuration * 3.0f) + { + for (const FName& Name : VisemeNames) + { + TargetVisemes.FindOrAdd(Name) = 0.0f; + } + TargetVisemes.FindOrAdd(FName("sil")) = 1.0f; + PlaybackTimer = 0.0f; + + // Reset text state — but ONLY after the full response (agent_response) + // has arrived AND text was applied. This prevents destroying text between + // audio chunks of the SAME utterance: partial text arrives once, but + // ElevenLabs splits the audio into 2-3 chunks with gaps. Without + // bFullTextReceived, the text is erased after chunk 1's queue empties, + // leaving chunks 2-3 without text visemes (spectral fallback only). + if (AccumulatedText.Len() > 0 && bTextVisemesApplied && bFullTextReceived) + { + AccumulatedText.Reset(); + TextVisemeSequence.Reset(); + bTextVisemesApplied = false; + bFullTextReceived = false; + } + } + + // ── Asymmetric smoothing ───────────────────────────────────────────────── + // At SmoothingSpeed=15: AttackSpeed=15 → alpha=0.24/frame, ~4 frames to 70%. + // ReleaseSpeed=7.5 → alpha=0.12/frame, ~9 frames to 70%. Mouth opens quickly, + // closes more gradually for natural-looking speech. 
+ const float AttackSpeed = SmoothingSpeed * 1.0f; + const float ReleaseSpeed = SmoothingSpeed * 0.65f; bool bAnyNonZero = false; for (const FName& Name : VisemeNames) { float& Current = SmoothedVisemes.FindOrAdd(Name); - const float Target = TargetVisemes.FindOrAdd(Name); + const float Target = TargetVisemes.FindOrAdd(Name) * LipSyncStrength; - Current = FMath::Lerp(Current, Target * LipSyncStrength, Alpha); + const float Speed = (Target > Current) ? AttackSpeed : ReleaseSpeed; + const float Alpha = FMath::Clamp(DeltaTime * Speed, 0.0f, 1.0f); + + Current = FMath::Lerp(Current, Target, Alpha); // Snap to zero to avoid infinite tiny values if (Current < 0.001f) Current = 0.0f; if (Current > 0.001f) bAnyNonZero = true; } - // "sil" uses LipSyncStrength=1 always — it's the rest pose - SmoothedVisemes.FindOrAdd(FName("sil")) = FMath::Lerp( - SmoothedVisemes.FindOrAdd(FName("sil")), - TargetVisemes.FindOrAdd(FName("sil")), - Alpha); + // Periodic viseme activity log (Verbose — enable with log verbosity for debugging) + static int32 TickLogCount = 0; + if (++TickLogCount % 30 == 1) + { + FName DominantViseme = FName("sil"); + float DominantWeight = 0.0f; + for (const FName& Name : VisemeNames) + { + const float W = SmoothedVisemes.FindOrAdd(Name); + if (W > DominantWeight) + { + DominantWeight = W; + DominantViseme = Name; + } + } + + UE_LOG(LogElevenLabsLipSync, Verbose, + TEXT("LipSync: Queue=%d Viseme=%s(%.2f)"), + VisemeQueue.Num(), *DominantViseme.ToString(), DominantWeight); + } // Convert visemes to ARKit blendshapes MapVisemesToBlendshapes(); + // ── Additional blendshape-level smoothing ───────────────────────────── + // A second smoothing pass on the final ARKit blendshape values removes + // residual jitter from the OVR→ARKit mapping step. This is lighter than + // the viseme-level smoothing and provides a natural "soft" look. 
+ { + const float BSmoothAlpha = FMath::Clamp(DeltaTime * SmoothingSpeed * 0.4f, 0.0f, 1.0f); + for (auto& Pair : CurrentBlendshapes) + { + const float* Prev = PreviousBlendshapes.Find(Pair.Key); + if (Prev) + { + Pair.Value = FMath::Lerp(*Prev, Pair.Value, BSmoothAlpha); + } + } + PreviousBlendshapes = CurrentBlendshapes; + } + // Auto-apply morph targets if a target mesh is set if (TargetMesh) { @@ -438,11 +583,10 @@ void UElevenLabsLipSyncComponent::OnAudioChunkReceived(const TArray& PCMD const int16* Samples = reinterpret_cast(PCMData.GetData()); const int32 NumSamples = PCMData.Num() / sizeof(int16); - // DEBUG: log first audio chunk received static bool bFirstChunkLogged = false; if (!bFirstChunkLogged) { - UE_LOG(LogElevenLabsLipSync, Log, TEXT("First audio chunk received: %d bytes (%d samples)"), PCMData.Num(), NumSamples); + UE_LOG(LogElevenLabsLipSync, Verbose, TEXT("First audio chunk received: %d bytes (%d samples)"), PCMData.Num(), NumSamples); bFirstChunkLogged = true; } @@ -452,14 +596,819 @@ void UElevenLabsLipSyncComponent::OnAudioChunkReceived(const TArray& PCMD FloatBuffer.Add(static_cast(Samples[i]) / 32768.0f); } - // Feed to rolling FFT analyzer + // ── STEP 1: ONE spectral analysis for the whole chunk (SHAPE) ───────── + // The FSpectrumAnalyzer's ring buffer returns nearly identical results for + // sequential 512-sample pushes. So we analyze the chunk as a whole to + // determine which mouth shape (viseme blend) to use. 
SpectrumAnalyzer->PushAudio(FloatBuffer.GetData(), NumSamples); + SpectrumAnalyzer->PerformAnalysisIfPossible(true); + AnalyzeSpectrum(); // Sets TargetVisemes with shape-only weights (~1.0) - // Try to perform analysis (returns true when enough data for one FFT window) - if (SpectrumAnalyzer->PerformAnalysisIfPossible(true)) + // Save the spectral shape for this chunk + TMap ChunkShape = TargetVisemes; + + // ── Late start fix: when queue was empty, delay playback to wait for text ── + // Partial text usually arrives 50-100ms before audio, but sometimes audio + // comes first. A small delay gives text time to arrive and be applied to + // the first frames, preventing mute mouth at utterance start. + const bool bQueueWasEmpty = (VisemeQueue.Num() == 0); + + // ── STEP 2: Per-window amplitude + ZCR (DYNAMICS + VARIATION) ───────── + // For each 512-sample window (~32ms), compute: + // - RMS amplitude: captures syllable rhythm (natural opening/closing) + // - Zero-crossing rate: detects sibilants/fricatives within the chunk + // The shape (which visemes) stays constant per chunk, but the amplitude + // (how much) varies per window, creating realistic speech dynamics. + constexpr int32 WindowSize = 512; + int32 WindowsQueued = 0; + float MinAmp = 1.0f, MaxAmp = 0.0f; // For debug logging + + for (int32 Offset = 0; Offset + WindowSize <= NumSamples; Offset += WindowSize) { - AnalyzeSpectrum(); + // RMS amplitude for this window + float SumSquares = 0.0f; + int32 ZeroCrossings = 0; + for (int32 i = 0; i < WindowSize; ++i) + { + const float S = FloatBuffer[Offset + i]; + SumSquares += S * S; + if (i > 0 && ((S >= 0.0f) != (FloatBuffer[Offset + i - 1] >= 0.0f))) + ZeroCrossings++; + } + + const float WindowRMS = FMath::Sqrt(SumSquares / static_cast(WindowSize)); + const float ZCR = static_cast(ZeroCrossings) / static_cast(WindowSize - 1); + + // Normalize amplitude: typical speech RMS at 16-bit is 0.02-0.15. 
+ // Scale up and apply power curve for dynamic range compression. + // pow(0.4) compresses more than sqrt (0.5): quiet parts become more + // visible while loud parts are slightly reduced. This ensures the + // first part of a TTS response (often quieter) has adequate lip movement. + float Amplitude = FMath::Clamp(WindowRMS * 10.0f, 0.0f, 1.5f); + Amplitude = FMath::Clamp(FMath::Pow(Amplitude, 0.4f), 0.0f, 1.0f); + + // Apply user-configurable amplitude attenuation (AmplitudeScale 0-1). + // This reduces overall mouth movement intensity without changing the + // viseme shape, giving control over "how much" the mouth opens. + Amplitude *= AmplitudeScale; + + MinAmp = FMath::Min(MinAmp, Amplitude); + MaxAmp = FMath::Max(MaxAmp, Amplitude); + + // Build this window's viseme frame + TMap WindowVisemes; + + if (Amplitude < 0.08f) + { + // Silence — mouth closed (between syllables / pauses) + for (const FName& Name : VisemeNames) + WindowVisemes.Add(Name, 0.0f); + WindowVisemes.FindOrAdd(FName("sil")) = 1.0f; + } + else + { + // Active speech — determine shape and scale by amplitude + + // High ZCR (>0.15) suggests fricative/sibilant energy. + // This provides within-chunk shape variation: voiced segments + // use the spectral shape, fricative segments override to SS/FF. 
+ if (ZCR > 0.15f) + { + for (const FName& Name : VisemeNames) + WindowVisemes.Add(Name, 0.0f); + + float SibStrength = FMath::Clamp((ZCR - 0.15f) * 5.0f, 0.0f, 1.0f); + WindowVisemes.FindOrAdd(FName("SS")) = SibStrength * Amplitude; + WindowVisemes.FindOrAdd(FName("FF")) = (1.0f - SibStrength) * Amplitude * 0.5f; + WindowVisemes.FindOrAdd(FName("ih")) = (1.0f - SibStrength) * Amplitude * 0.3f; + + // Blend in the chunk shape at reduced weight for non-sibilant visemes + for (const FName& Name : VisemeNames) + { + if (Name != FName("SS") && Name != FName("FF") && Name != FName("ih") && Name != FName("sil")) + { + WindowVisemes.FindOrAdd(Name) += ChunkShape.FindRef(Name) * Amplitude * (1.0f - SibStrength) * 0.4f; + } + } + } + else + { + // Voiced segment — use chunk spectral shape scaled by amplitude. + // This creates the primary speech animation: syllable rhythm + // from amplitude, mouth shape from spectral analysis. + for (const FName& Name : VisemeNames) + { + if (Name == FName("sil")) + { + WindowVisemes.Add(Name, 0.0f); + } + else + { + WindowVisemes.Add(Name, ChunkShape.FindRef(Name) * Amplitude); + } + } + } + } + + VisemeQueue.Add(WindowVisemes); + AmplitudeQueue.Add(Amplitude); + WindowsQueued++; } + + // ── Pseudo-speech fallback (no text available) ────────────────────── + // When text visemes are not available (server doesn't send partial text, + // or text arrives much later than audio), create natural-looking mouth + // movement by cycling through vowel/consonant shapes at speech rate. + // This is MUCH better than the single spectral shape (one shape for the + // entire chunk). If text arrives later, ApplyTextVisemesToQueue() will + // overwrite these frames with proper text-driven visemes. + if (TextVisemeSequence.Num() == 0 && WindowsQueued > 0) + { + // Vowel/consonant alternation at ~5 syllables/second. + // Each "syllable" = 3 frames vowel + 2 frames consonant = 5 frames × 32ms = 160ms. 
+ static const FName VowelShapes[] = { FName("aa"), FName("oh"), FName("E"), FName("ih"), FName("ou") }; + static const FName ConsonantShapes[] = { FName("nn"), FName("PP"), FName("DD"), FName("kk"), FName("RR") }; + constexpr int32 NumShapes = 5; + constexpr int32 VowelFrames = 3; // ~96ms open + constexpr int32 ConsonantFrames = 2; // ~64ms transition + constexpr int32 SyllableFrames = VowelFrames + ConsonantFrames; // ~160ms + + int32 StartIdx = VisemeQueue.Num() - WindowsQueued; + int32 ActiveCount = 0; + int32 PseudoCount = 0; + + for (int32 Idx = StartIdx; Idx < VisemeQueue.Num() && Idx < AmplitudeQueue.Num(); ++Idx) + { + const float Amp = AmplitudeQueue[Idx]; + if (Amp < 0.08f) continue; // Keep silent frames as-is + + const int32 SyllableIdx = ActiveCount / SyllableFrames; + const int32 FrameInSyllable = ActiveCount % SyllableFrames; + const int32 ShapeIdx = SyllableIdx % NumShapes; + + TMap& Frame = VisemeQueue[Idx]; + for (const FName& Name : VisemeNames) + Frame.FindOrAdd(Name) = 0.0f; + + if (FrameInSyllable < VowelFrames) + { + // Vowel phase — mouth open + const FName Vowel = VowelShapes[ShapeIdx]; + Frame.FindOrAdd(Vowel) = Amp; + + // Anticipatory blend in last vowel frame towards consonant + if (FrameInSyllable == VowelFrames - 1) + { + const FName Consonant = ConsonantShapes[ShapeIdx]; + Frame.FindOrAdd(Vowel) = Amp * 0.7f; + Frame.FindOrAdd(Consonant) = Amp * 0.3f; + } + } + else + { + // Consonant/transition phase — mouth partially closed + const FName Consonant = ConsonantShapes[ShapeIdx]; + Frame.FindOrAdd(Consonant) = Amp * 0.7f; + + // Anticipatory blend towards next vowel in last consonant frame + if (FrameInSyllable == SyllableFrames - 1) + { + const int32 NextShapeIdx = (SyllableIdx + 1) % NumShapes; + const FName NextVowel = VowelShapes[NextShapeIdx]; + Frame.FindOrAdd(NextVowel) = Amp * 0.3f; + } + } + + ActiveCount++; + PseudoCount++; + } + + if (PseudoCount > 0) + { + UE_LOG(LogElevenLabsLipSync, Verbose, + TEXT("Pseudo-speech: %d 
active frames (%d syllables)"), + PseudoCount, (PseudoCount + SyllableFrames - 1) / SyllableFrames); + } + } + + // ── Late start fix + wait-for-text ─────────────────────────────────── + // When a new utterance begins (queue was empty): + // 1) Override leading silent frames (TTS fade-in) with minimum amplitude + // 2) If text hasn't arrived yet, hold playback until it does (max 500ms) + // This ensures ALL frames get text-driven visemes from the start. + if (bQueueWasEmpty && WindowsQueued > 0) + { + // Override leading silent frames with minimum amplitude + constexpr float MinStartAmplitude = 0.15f; + int32 FixedCount = 0; + for (int32 Idx = 0; Idx < VisemeQueue.Num() && Idx < AmplitudeQueue.Num(); ++Idx) + { + if (AmplitudeQueue[Idx] >= 0.08f) + break; // Stop at first naturally active frame + + AmplitudeQueue[Idx] = MinStartAmplitude; + TMap& Frame = VisemeQueue[Idx]; + for (const FName& Name : VisemeNames) + { + Frame.FindOrAdd(Name) = (Name == FName("sil")) + ? 0.0f + : ChunkShape.FindRef(Name) * MinStartAmplitude; + } + FixedCount++; + } + + if (FixedCount > 0) + { + UE_LOG(LogElevenLabsLipSync, Verbose, + TEXT("Late start fix: overrode %d leading silent frames with min amplitude %.2f"), + FixedCount, MinStartAmplitude); + } + + // If text is already available (from partial responses arriving before audio), + // apply it immediately and start playback. + // Otherwise, hold playback until text arrives (wait-for-text mechanism). + if (AccumulatedText.Len() > 0 && TextVisemeSequence.Num() >= 3) + { + // Text already available — apply and start playback immediately + ApplyTextVisemesToQueue(); + PlaybackTimer = 0.0f; + UE_LOG(LogElevenLabsLipSync, Verbose, + TEXT("Text already available (%d visemes). 
Starting lip sync immediately."), + TextVisemeSequence.Num()); + } + else + { + // No text yet — hold playback until text arrives or timeout + bWaitingForText = true; + WaitingForTextStartTime = FPlatformTime::Seconds(); + PlaybackTimer = 0.0f; + UE_LOG(LogElevenLabsLipSync, Log, + TEXT("Waiting for text before starting lip sync (%d frames queued)."), + WindowsQueued); + } + } + else if (AccumulatedText.Len() > 0 && TextVisemeSequence.Num() > 0) + { + // Not a new utterance but text is available — apply to new frames + ApplyTextVisemesToQueue(); + } + + UE_LOG(LogElevenLabsLipSync, Log, + TEXT("Audio chunk: %d samples → %d windows | Amp=[%.2f-%.2f] | Queue=%d (%.1fs) | TextVisemes=%d"), + NumSamples, WindowsQueued, + MinAmp, MaxAmp, VisemeQueue.Num(), + VisemeQueue.Num() * (512.0f / 16000.0f), TextVisemeSequence.Num()); +} + +// ───────────────────────────────────────────────────────────────────────────── +// Text-driven lip sync +// ───────────────────────────────────────────────────────────────────────────── + +void UElevenLabsLipSyncComponent::OnPartialTextReceived(const FString& PartialText) +{ + // If the previous utterance's full text was already received, + // this partial text belongs to a NEW utterance — start fresh. + if (bFullTextReceived) + { + AccumulatedText.Reset(); + TextVisemeSequence.Reset(); + bTextVisemesApplied = false; + bFullTextReceived = false; + } + + // Accumulate streaming text fragments (arrive BEFORE audio) + AccumulatedText += PartialText; + + // Convert accumulated text to viseme sequence progressively + ConvertTextToVisemes(AccumulatedText); + + UE_LOG(LogElevenLabsLipSync, Log, + TEXT("Partial text: \"%s\" → %d visemes (accumulated: \"%s\")"), + *PartialText, TextVisemeSequence.Num(), *AccumulatedText); + + // If we were waiting for text to arrive before starting playback, + // apply text visemes to queued frames and start consuming. 
+ if (bWaitingForText && TextVisemeSequence.Num() >= 3) + { + if (VisemeQueue.Num() > 0) + { + ApplyTextVisemesToQueue(); + } + bWaitingForText = false; + PlaybackTimer = 0.0f; // Start consuming now + const double WaitElapsed = FPlatformTime::Seconds() - WaitingForTextStartTime; + UE_LOG(LogElevenLabsLipSync, Verbose, + TEXT("Text arrived after %.0fms wait. Starting lip sync playback."), + WaitElapsed * 1000.0); + } +} + +void UElevenLabsLipSyncComponent::OnTextResponseReceived(const FString& ResponseText) +{ + // Full text arrived — use it as the definitive source + bFullTextReceived = true; + AccumulatedText = ResponseText; + ConvertTextToVisemes(ResponseText); + + UE_LOG(LogElevenLabsLipSync, Log, + TEXT("Full text: \"%s\" → %d visemes"), *ResponseText, TextVisemeSequence.Num()); + + // Apply to any remaining queued frames + if (VisemeQueue.Num() > 0) + { + ApplyTextVisemesToQueue(); + } + + // If we were waiting for text to arrive before starting playback, start now + if (bWaitingForText) + { + bWaitingForText = false; + PlaybackTimer = 0.0f; + const double WaitElapsed = FPlatformTime::Seconds() - WaitingForTextStartTime; + UE_LOG(LogElevenLabsLipSync, Verbose, + TEXT("Full text arrived after %.0fms wait. Starting lip sync playback."), + WaitElapsed * 1000.0); + } + + // Log the viseme sequence for debugging + { + FString VisSeq; + int32 Count = 0; + for (const FName& V : TextVisemeSequence) + { + if (Count > 0) VisSeq += TEXT(" "); + VisSeq += V.ToString(); + if (++Count >= 30) { VisSeq += TEXT(" ..."); break; } + } + UE_LOG(LogElevenLabsLipSync, Verbose, TEXT("Viseme sequence: [%s]"), *VisSeq); + } + + // NOTE: Do NOT reset bTextVisemesApplied here. It's reset in TickComponent + // when the queue empties AFTER the text has been consumed. Resetting it here + // would prevent TickComponent from cleaning up AccumulatedText, which then + // persists and corrupts the next utterance's partial text accumulation. 
+} + +void UElevenLabsLipSyncComponent::ConvertTextToVisemes(const FString& Text) +{ + TextVisemeSequence.Reset(); + + // Lowercase for matching + FString Lower = Text.ToLower(); + + // Process character by character, checking multi-char graphemes first. + // This supports French and English phoneme-to-viseme mapping. + int32 i = 0; + while (i < Lower.Len()) + { + TCHAR C = Lower[i]; + TCHAR C1 = (i + 1 < Lower.Len()) ? Lower[i + 1] : 0; + TCHAR C2 = (i + 2 < Lower.Len()) ? Lower[i + 2] : 0; + + // ── 3-char graphemes ────────────────────────────────────────────── + if (C == 'e' && C1 == 'a' && C2 == 'u') + { + // eau → /o/ (oh) + TextVisemeSequence.Add(FName("oh")); + i += 3; continue; + } + if (C == 'a' && C1 == 'i' && C2 == 'n') + { + // ain → /ɛ̃/ (E nasal) + TextVisemeSequence.Add(FName("E")); + i += 3; continue; + } + if (C == 'e' && C1 == 'i' && C2 == 'n') + { + // ein → /ɛ̃/ (E nasal) + TextVisemeSequence.Add(FName("E")); + i += 3; continue; + } + if (C == 'o' && C1 == 'e' && C2 == 'u') + { + // oeu → /ø/ (oh-like) + TextVisemeSequence.Add(FName("oh")); + i += 3; continue; + } + + // ── 2-char graphemes ────────────────────────────────────────────── + if (C == 'o' && C1 == 'u') + { + // ou → /u/ (ou) + TextVisemeSequence.Add(FName("ou")); + i += 2; continue; + } + if (C == 'o' && C1 == 'i') + { + // oi → /wa/ (ou + aa) + TextVisemeSequence.Add(FName("ou")); + TextVisemeSequence.Add(FName("aa")); + i += 2; continue; + } + if (C == 'o' && C1 == 'n') + { + // on → /ɔ̃/ (oh nasal) + TextVisemeSequence.Add(FName("oh")); + i += 2; continue; + } + if (C == 'o' && C1 == 'm' && (C2 == 0 || !FChar::IsAlpha(C2))) + { + // om at end → /ɔ̃/ (oh nasal) + TextVisemeSequence.Add(FName("oh")); + i += 2; continue; + } + if (C == 'a' && (C1 == 'n' || C1 == 'm')) + { + // an, am → /ɑ̃/ (aa nasal) + TextVisemeSequence.Add(FName("aa")); + i += 2; continue; + } + if (C == 'e' && C1 == 'n') + { + // en → /ɑ̃/ (aa nasal, French) + TextVisemeSequence.Add(FName("aa")); + i += 2; 
continue; + } + if (C == 'e' && C1 == 'm' && (C2 == 0 || !FChar::IsAlpha(C2))) + { + // em at end → /ɑ̃/ + TextVisemeSequence.Add(FName("aa")); + i += 2; continue; + } + if (C == 'i' && (C1 == 'n' || C1 == 'm')) + { + // in, im → /ɛ̃/ + TextVisemeSequence.Add(FName("ih")); + i += 2; continue; + } + if (C == 'u' && C1 == 'n') + { + // un → /œ̃/ + TextVisemeSequence.Add(FName("ih")); + i += 2; continue; + } + if (C == 'a' && C1 == 'u') + { + // au → /o/ + TextVisemeSequence.Add(FName("oh")); + i += 2; continue; + } + if (C == 'a' && C1 == 'i') + { + // ai → /ɛ/ + TextVisemeSequence.Add(FName("E")); + i += 2; continue; + } + if (C == 'e' && C1 == 'i') + { + // ei → /ɛ/ + TextVisemeSequence.Add(FName("E")); + i += 2; continue; + } + if (C == 'e' && C1 == 'u') + { + // eu → /ø/ + TextVisemeSequence.Add(FName("oh")); + i += 2; continue; + } + if (C == 'c' && C1 == 'h') + { + // ch → /ʃ/ + TextVisemeSequence.Add(FName("CH")); + i += 2; continue; + } + if (C == 's' && C1 == 'h') + { + // sh → /ʃ/ + TextVisemeSequence.Add(FName("CH")); + i += 2; continue; + } + if (C == 'g' && C1 == 'n') + { + // gn → /ɲ/ + TextVisemeSequence.Add(FName("nn")); + i += 2; continue; + } + if (C == 'p' && C1 == 'h') + { + // ph → /f/ + TextVisemeSequence.Add(FName("FF")); + i += 2; continue; + } + if (C == 't' && C1 == 'h') + { + // th → /θ/ + TextVisemeSequence.Add(FName("TH")); + i += 2; continue; + } + if (C == 'q' && C1 == 'u') + { + // qu → /k/ + TextVisemeSequence.Add(FName("kk")); + i += 2; continue; + } + if (C == 'l' && C1 == 'l') + { + // ll → /l/ (single) + TextVisemeSequence.Add(FName("RR")); + i += 2; continue; + } + if (C == 's' && C1 == 's') + { + // ss → /s/ + TextVisemeSequence.Add(FName("SS")); + i += 2; continue; + } + if (C == 'm' && C1 == 'm') + { + // mm → /m/ + TextVisemeSequence.Add(FName("PP")); + i += 2; continue; + } + if (C == 'n' && C1 == 'n') + { + // nn → /n/ + TextVisemeSequence.Add(FName("nn")); + i += 2; continue; + } + if (C == 't' && C1 == 't') + { + // tt → 
/t/ + TextVisemeSequence.Add(FName("DD")); + i += 2; continue; + } + if (C == 'c' && (C1 == 'e' || C1 == 'i' || C1 == 'y')) + { + // ce, ci, cy → /s/ + TextVisemeSequence.Add(FName("SS")); + i += 1; continue; // Only consume the 'c', let the vowel be processed next + } + if (C == 'g' && (C1 == 'e' || C1 == 'i' || C1 == 'y')) + { + // ge, gi, gy → /ʒ/ + TextVisemeSequence.Add(FName("CH")); + i += 1; continue; + } + + // ── French silent letters at end of word ────────────────────────── + // In French, final s, t, d, x, z are typically silent. + // Examples: "vous" → /vu/, "comment" → /kɔmɑ̃/, "allez" → /ale/ + { + bool bIsWordFinal = (i + 1 >= Lower.Len()) || !FChar::IsAlpha(Lower[i + 1]); + + // Silent final consonants + if (bIsWordFinal && (C == 's' || C == 't' || C == 'd' || C == 'x' || C == 'z')) + { + i++; continue; + } + + // e muet (silent 'e') at end of word — not é, è, ê + // Plain 'e' at end of a word is usually silent in French. + // Accented variants (é=0xE9, è=0xE8, ê=0xEA) are always pronounced. 
+ if (C == 'e' && bIsWordFinal && i > 0 && FChar::IsAlpha(Lower[i - 1])) + { + i++; continue; + } + } + + // ── Single characters ───────────────────────────────────────────── + switch (C) + { + // Vowels + case 'a': case TCHAR(0xE0): case TCHAR(0xE2): // a, à, â + TextVisemeSequence.Add(FName("aa")); break; + case 'e': case TCHAR(0xE9): case TCHAR(0xE8): case TCHAR(0xEA): // e, é, è, ê + TextVisemeSequence.Add(FName("E")); break; + case 'i': case TCHAR(0xEE): case TCHAR(0xEF): // i, î, ï + TextVisemeSequence.Add(FName("ih")); break; + case 'o': case TCHAR(0xF4): // o, ô + TextVisemeSequence.Add(FName("oh")); break; + case 'u': case TCHAR(0xFB): case TCHAR(0xFC): // u, û, ü + TextVisemeSequence.Add(FName("ou")); break; + case 'y': + TextVisemeSequence.Add(FName("ih")); break; + + // Consonants + case 'b': + TextVisemeSequence.Add(FName("PP")); break; + case 'c': case 'k': case 'q': + TextVisemeSequence.Add(FName("kk")); break; + case 'd': + TextVisemeSequence.Add(FName("DD")); break; + case 'f': + TextVisemeSequence.Add(FName("FF")); break; + case 'g': + TextVisemeSequence.Add(FName("kk")); break; + case 'h': + // Silent in French, aspirated in English — skip + break; + case 'j': + TextVisemeSequence.Add(FName("CH")); break; + case 'l': + TextVisemeSequence.Add(FName("RR")); break; + case 'm': + TextVisemeSequence.Add(FName("PP")); break; + case 'n': + TextVisemeSequence.Add(FName("nn")); break; + case 'p': + TextVisemeSequence.Add(FName("PP")); break; + case 'r': + TextVisemeSequence.Add(FName("RR")); break; + case 's': + TextVisemeSequence.Add(FName("SS")); break; + case 't': + TextVisemeSequence.Add(FName("DD")); break; + case 'v': + TextVisemeSequence.Add(FName("FF")); break; + case 'w': + TextVisemeSequence.Add(FName("ou")); break; + case 'x': + TextVisemeSequence.Add(FName("kk")); + TextVisemeSequence.Add(FName("SS")); break; + case 'z': + TextVisemeSequence.Add(FName("SS")); break; + + // Space / punctuation → silence + case ' ': case ',': case '.': case 
'!': case '?': case ';': case ':': + case '-': case '\n': case '\r': + TextVisemeSequence.Add(FName("sil")); break; + + default: + // Unknown character — skip + break; + } + + i++; + } + + // ── Post-processing: merge consecutive silence entries ──────────────── + // "Bonjour, " generates two sil (comma + space). Collapse to one. + { + TArray Merged; + Merged.Reserve(TextVisemeSequence.Num()); + for (const FName& V : TextVisemeSequence) + { + if (V == FName("sil") && Merged.Num() > 0 && Merged.Last() == FName("sil")) + continue; // Skip duplicate sil + Merged.Add(V); + } + // Also strip leading/trailing sil + while (Merged.Num() > 0 && Merged[0] == FName("sil")) + Merged.RemoveAt(0); + while (Merged.Num() > 0 && Merged.Last() == FName("sil")) + Merged.RemoveAt(Merged.Num() - 1); + + TextVisemeSequence = MoveTemp(Merged); + } +} + +// Duration weights for viseme types. +// Vowels naturally last longer than consonants in speech. +// These weights control how many audio frames each viseme occupies. 
+static float GetVisemeDurationWeight(const FName& Viseme) +{ + // Vowels — sustained, mouth held open: ~100-150ms + if (Viseme == FName("aa") || Viseme == FName("oh") || Viseme == FName("E")) + return 2.0f; + if (Viseme == FName("ih") || Viseme == FName("ou")) + return 1.7f; + + // Liquids / nasals — semi-sustained: ~60-100ms + if (Viseme == FName("RR") || Viseme == FName("nn")) + return 1.5f; + + // Fricatives — moderate duration: ~60-80ms + if (Viseme == FName("SS") || Viseme == FName("FF") || Viseme == FName("CH") || Viseme == FName("TH")) + return 1.2f; + + // Plosives — short closure: ~50-70ms (not too short to avoid frantic look) + if (Viseme == FName("PP") || Viseme == FName("DD") || Viseme == FName("kk")) + return 0.8f; + + // Silence — brief pause between words (keep short to avoid frozen look) + if (Viseme == FName("sil")) + return 1.0f; + + return 1.0f; +} + +void UElevenLabsLipSyncComponent::ApplyTextVisemesToQueue() +{ + if (TextVisemeSequence.Num() == 0 || VisemeQueue.Num() == 0) return; + + // Count non-silent frames (amplitude > threshold) in the queue + int32 ActiveFrames = 0; + for (int32 Idx = 0; Idx < AmplitudeQueue.Num(); ++Idx) + { + if (AmplitudeQueue[Idx] >= 0.08f) + ActiveFrames++; + } + + if (ActiveFrames == 0) return; + + // ── Duration-weighted distribution ──────────────────────────────────── + // Vowels get more frames than consonants, creating natural timing where + // the mouth lingers on open vowels and quickly transitions through plosives. + + // Compute total weighted duration of the viseme sequence + float TotalWeight = 0.0f; + for (const FName& V : TextVisemeSequence) + { + TotalWeight += GetVisemeDurationWeight(V); + } + + // Build a cumulative weight array for mapping frame index → viseme index. 
+ // CumulativeWeight[i] = sum of weights from viseme 0..i-1 + TArray CumulativeWeight; + CumulativeWeight.SetNum(TextVisemeSequence.Num() + 1); + CumulativeWeight[0] = 0.0f; + for (int32 V = 0; V < TextVisemeSequence.Num(); ++V) + { + CumulativeWeight[V + 1] = CumulativeWeight[V] + GetVisemeDurationWeight(TextVisemeSequence[V]); + } + + // For each active audio frame, find which viseme it maps to based on + // its proportional position in the weighted timeline. + int32 ActiveIdx = 0; + for (int32 Idx = 0; Idx < VisemeQueue.Num() && Idx < AmplitudeQueue.Num(); ++Idx) + { + const float Amp = AmplitudeQueue[Idx]; + + if (Amp < 0.08f) + { + // Silent frame — keep as silence + continue; + } + + // Where are we in the weighted timeline? (0..TotalWeight) + const float TimelinePos = (static_cast(ActiveIdx) / static_cast(ActiveFrames)) * TotalWeight; + + // Find which viseme this position falls into (linear scan over cumulative weights) + int32 VisemeIdx = 0; + for (int32 V = 0; V < TextVisemeSequence.Num(); ++V) + { + if (TimelinePos >= CumulativeWeight[V] && TimelinePos < CumulativeWeight[V + 1]) + { + VisemeIdx = V; + break; + } + VisemeIdx = V; // Fallback to last + } + + const FName TextViseme = TextVisemeSequence[VisemeIdx]; + + // Blend progress within current viseme (0..1) + const float VisemeStart = CumulativeWeight[VisemeIdx]; + const float VisemeDuration = CumulativeWeight[VisemeIdx + 1] - VisemeStart; + const float LocalProgress = (VisemeDuration > 0.01f) + ?
FMath::Clamp((TimelinePos - VisemeStart) / VisemeDuration, 0.0f, 1.0f) + : 0.0f; + + // Next viseme for blending during the last 30% of each viseme + const int32 NextIdx = FMath::Min(VisemeIdx + 1, TextVisemeSequence.Num() - 1); + const FName NextViseme = TextVisemeSequence[NextIdx]; + + // Rebuild this frame: text-derived shape × stored amplitude + TMap& Frame = VisemeQueue[Idx]; + for (const FName& Name : VisemeNames) + { + Frame.FindOrAdd(Name) = 0.0f; + } + + if (TextViseme == FName("sil")) + { + // Text-driven silence — mouth closes + Frame.FindOrAdd(FName("sil")) = 1.0f; + } + else + { + // Anticipatory blending: in the last 30% of each viseme, + // gradually blend towards the next viseme shape. + const float BlendZone = 0.3f; + float BlendToNext = 0.0f; + if (LocalProgress > (1.0f - BlendZone) && NextViseme != TextViseme && NextViseme != FName("sil")) + { + BlendToNext = (LocalProgress - (1.0f - BlendZone)) / BlendZone; + } + + // Primary viseme shape × amplitude + Frame.FindOrAdd(TextViseme) += Amp * (1.0f - BlendToNext * 0.5f); + + // Blend towards next viseme + if (BlendToNext > 0.0f && NextViseme != FName("sil")) + { + Frame.FindOrAdd(NextViseme) += Amp * BlendToNext * 0.5f; + } + } + + ActiveIdx++; + } + + bTextVisemesApplied = true; + + UE_LOG(LogElevenLabsLipSync, Log, + TEXT("Applied %d text visemes to %d active frames (of %d total)"), + TextVisemeSequence.Num(), ActiveFrames, VisemeQueue.Num()); } void UElevenLabsLipSyncComponent::AnalyzeSpectrum() @@ -478,14 +1427,9 @@ void UElevenLabsLipSyncComponent::AnalyzeSpectrum() const float TotalEnergy = VoiceEnergy + F1Energy + F2Energy + F3Energy + SibilantEnergy; - // DEBUG: log energy levels periodically - static int32 AnalysisCount = 0; - if (++AnalysisCount % 50 == 1) // Log every ~50 analyses - { - UE_LOG(LogElevenLabsLipSync, Log, - TEXT("Spectrum: Total=%.4f F1=%.4f F2=%.4f F3=%.4f Sibilant=%.4f"), - TotalEnergy, F1Energy, F2Energy, F3Energy, SibilantEnergy); - } + UE_LOG(LogElevenLabsLipSync, 
Verbose, + TEXT("Spectrum: Total=%.4f F1=%.4f F2=%.4f F3=%.4f Sibilant=%.4f"), + TotalEnergy, F1Energy, F2Energy, F3Energy, SibilantEnergy); EstimateVisemes(TotalEnergy, F1Energy, F2Energy, F3Energy, SibilantEnergy); } @@ -519,131 +1463,103 @@ void UElevenLabsLipSyncComponent::EstimateVisemes(float TotalEnergy, TargetVisemes.FindOrAdd(Name) = 0.0f; } - // Silence threshold — below this, mouth is closed - constexpr float SilenceThreshold = 0.002f; - - if (TotalEnergy < SilenceThreshold) + // Below noise floor → silence shape + if (TotalEnergy < 0.01f) { TargetVisemes.FindOrAdd(FName("sil")) = 1.0f; return; } - // Normalize band energies relative to total + // ── Spectral ratios determine mouth SHAPE (not intensity) ──────────── + // These weights are ~1.0 (full strength). Per-window amplitude in + // OnAudioChunkReceived scales them to create speech dynamics. + // This function produces a "shape template" for the entire audio chunk. const float InvTotal = 1.0f / FMath::Max(TotalEnergy, 0.0001f); const float NormF1 = F1Energy * InvTotal; const float NormF2 = F2Energy * InvTotal; const float NormF3 = F3Energy * InvTotal; const float NormSibilant = SibilantEnergy * InvTotal; - // Energy-based intensity (how "loud" the speech is — drives overall jaw opening) - // Scale to a usable 0-1 range. The constant is empirically tuned. - const float Intensity = FMath::Clamp(TotalEnergy * 25.0f, 0.0f, 1.0f); + // Brightness: ratio of high-freq to low-freq energy. + // Low brightness = rounded lips (oh, ou). High = spread lips (E, ih, SS). + const float Brightness = FMath::Clamp( + (NormF2 + NormF3 + NormSibilant * 2.0f) / FMath::Max(NormF1 + 0.01f, 0.01f), + 0.0f, 4.0f) / 4.0f; - // ── Classification based on spectral shape ─────────────────────────────── - // The approach: compute "votes" for each viseme category based on where - // the spectral energy is concentrated. Multiple visemes can be active - // simultaneously (blended). 
+ // ── Primary vowel/consonant shape (mutually exclusive) ─────────────── - // Fricatives / sibilants: high-frequency energy dominates - if (NormSibilant > 0.25f) + if (NormSibilant > 0.2f) { - const float FricativeWeight = NormSibilant * Intensity; - // Distinguish S/Z (narrow, higher freq) from SH/CH (broader, lower freq) + // Sibilant chunk — per-window ZCR will refine this further + float SibWeight = FMath::Clamp(NormSibilant * 2.0f, 0.0f, 1.0f); if (NormF3 > NormF2) { - TargetVisemes.FindOrAdd(FName("SS")) = FricativeWeight; + TargetVisemes.FindOrAdd(FName("SS")) = SibWeight; } else { - TargetVisemes.FindOrAdd(FName("CH")) = FricativeWeight * 0.7f; - TargetVisemes.FindOrAdd(FName("SS")) = FricativeWeight * 0.3f; + TargetVisemes.FindOrAdd(FName("CH")) = SibWeight * 0.7f; + TargetVisemes.FindOrAdd(FName("SS")) = SibWeight * 0.3f; } - // F/V component - TargetVisemes.FindOrAdd(FName("FF")) = FricativeWeight * 0.3f; + TargetVisemes.FindOrAdd(FName("FF")) = SibWeight * 0.3f; } - - // Voiced speech: most energy in voice + F1 + F2 - if (NormSibilant < 0.5f) + else if (Brightness > 0.55f) { - const float VoicedWeight = (1.0f - NormSibilant) * Intensity; - - // Open vowels: strong F1 = wide jaw opening - if (NormF1 > 0.3f) + // Bright (front vowel): E or ih — spread lips + if (NormF1 > 0.2f) { - if (NormF2 > 0.35f) - { - // High F2 + high F1 → front open vowel (A as in "cat") - TargetVisemes.FindOrAdd(FName("aa")) = VoicedWeight * NormF1; - } - else - { - // Low F2 + high F1 → back open vowel (O as in "go") - TargetVisemes.FindOrAdd(FName("oh")) = VoicedWeight * NormF1 * 0.7f; - TargetVisemes.FindOrAdd(FName("aa")) = VoicedWeight * NormF1 * 0.3f; - } + TargetVisemes.FindOrAdd(FName("E")) = 1.0f; + TargetVisemes.FindOrAdd(FName("aa")) = 0.3f; } - - // Mid vowels: moderate F1 - if (NormF1 > 0.15f && NormF1 <= 0.3f) + else { - if (NormF2 > 0.4f) - { - // High F2 → front mid vowel (E as in "bed") - TargetVisemes.FindOrAdd(FName("E")) = VoicedWeight * 0.7f; - } - else - { 
- // Low F2 → rounded mid vowel - TargetVisemes.FindOrAdd(FName("oh")) = VoicedWeight * 0.5f; - } - } - - // Close vowels: weak F1 - if (NormF1 <= 0.15f && NormF2 > 0.0f) - { - if (NormF2 > 0.4f) - { - // High F2 → front close vowel (I as in "see") - TargetVisemes.FindOrAdd(FName("ih")) = VoicedWeight * 0.6f; - } - else - { - // Low F2 → back close vowel (OO as in "boot") - TargetVisemes.FindOrAdd(FName("ou")) = VoicedWeight * 0.6f; - } - } - - // Nasals / liquids: prominent F3 with low sibilant - if (NormF3 > 0.2f && NormSibilant < 0.15f) - { - if (NormF1 < 0.2f) - { - TargetVisemes.FindOrAdd(FName("nn")) = VoicedWeight * 0.4f; - } - else - { - TargetVisemes.FindOrAdd(FName("RR")) = VoicedWeight * 0.3f; - } - } - - // Plosive detection: very low F1 with moderate energy = lips/tongue closed - if (NormF1 < 0.1f && Intensity > 0.3f && NormSibilant < 0.2f) - { - TargetVisemes.FindOrAdd(FName("PP")) = VoicedWeight * 0.3f; - TargetVisemes.FindOrAdd(FName("DD")) = VoicedWeight * 0.2f; + TargetVisemes.FindOrAdd(FName("ih")) = 0.8f; } } - - // TH detection: moderate sibilant + moderate F3 (dental fricative) - if (NormSibilant > 0.15f && NormSibilant < 0.35f && NormF3 > 0.15f) + else if (Brightness < 0.3f) { - TargetVisemes.FindOrAdd(FName("TH")) = Intensity * 0.3f; + // Dark (back vowel): oh or ou — rounded lips + if (NormF1 > 0.2f) + { + TargetVisemes.FindOrAdd(FName("oh")) = 1.0f; + } + else + { + TargetVisemes.FindOrAdd(FName("ou")) = 0.8f; + } + } + else + { + // Neutral / open vowel: aa — wide open jaw + TargetVisemes.FindOrAdd(FName("aa")) = 1.0f; } - // Ensure at least some silence weight when energy is very low - if (Intensity < 0.1f) + // ── Secondary consonant contributions (additive) ───────────────────── + + // Nasals (N, M, NG): prominent F3, low sibilant + if (NormF3 > 0.25f && NormSibilant < 0.15f) { - TargetVisemes.FindOrAdd(FName("sil")) = 1.0f - Intensity * 10.0f; + TargetVisemes.FindOrAdd(FName("nn")) = 0.5f; + TargetVisemes.FindOrAdd(FName("RR")) = 
0.2f; + } + + // Plosive hint (P, B): very low F1 + if (NormF1 < 0.08f && NormSibilant < 0.2f) + { + TargetVisemes.FindOrAdd(FName("PP")) = 0.5f; + TargetVisemes.FindOrAdd(FName("DD")) = 0.3f; + } + + // Labiodental (F, V): moderate sibilant + lip involvement + if (NormSibilant > 0.12f && NormSibilant < 0.3f && NormF1 < 0.15f) + { + TargetVisemes.FindOrAdd(FName("FF")) = 0.6f; + } + + // Dental (TH): moderate sibilant + moderate F3 + if (NormSibilant > 0.12f && NormSibilant < 0.35f && NormF3 > 0.15f) + { + TargetVisemes.FindOrAdd(FName("TH")) = 0.4f; } } @@ -686,9 +1602,8 @@ void UElevenLabsLipSyncComponent::ApplyMorphTargets() { if (!TargetMesh) return; - // DEBUG: log blendshape values periodically static int32 ApplyCount = 0; - if (++ApplyCount % 120 == 1) // Log every ~2s at 60fps + if (++ApplyCount % 120 == 1) { FString DebugStr; for (const auto& Pair : CurrentBlendshapes) @@ -708,7 +1623,7 @@ void UElevenLabsLipSyncComponent::ApplyMorphTargets() } if (DebugStr.Len() > 0) { - UE_LOG(LogElevenLabsLipSync, Log, TEXT("%s: %s"), + UE_LOG(LogElevenLabsLipSync, Verbose, TEXT("%s: %s"), bUseCurveMode ? TEXT("Curves") : TEXT("Blendshapes"), *DebugStr); } } diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h index e5a5fe2..e3c1ced 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h @@ -136,6 +136,17 @@ public: meta = (ToolTip = "Fire OnAgentPartialResponse with streaming text fragments as the LLM generates them.\nIdeal for real-time subtitles. 
Each event gives one text chunk, not the accumulated text.")) bool bEnableAgentPartialResponse = false; + /** Pre-buffer delay (ms) before starting audio playback on the first chunk. + * ElevenLabs TTS splits responses into 2-3 audio chunks with gaps between them. + * Pre-buffering delays playback start so the second chunk arrives before the + * first finishes playing, eliminating the audible gap mid-sentence. + * Higher values = fewer gaps but more latency on the first word. + * Set to 0 for immediate playback (may cause mid-sentence pauses). */ + UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency", + meta = (ClampMin = "0", ClampMax = "500", + ToolTip = "Pre-buffer delay in ms before starting audio playback.\nAbsorbs gaps between TTS chunks to prevent mid-sentence pauses.\n0 = immediate playback, 250 = balanced, 500 = maximum smoothness.")) + int32 AudioPreBufferMs = 250; + /** Safety timeout: if the server does not start generating a response within this many seconds after the user stops speaking, fire OnAgentResponseTimeout. Set to 0 to disable. A normal response starts within 0.1-0.8s. */ UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs", meta = (ClampMin = "0.0", @@ -257,6 +268,11 @@ public: UFUNCTION(BlueprintPure, Category = "ElevenLabs") const FElevenLabsConversationInfo& GetConversationInfo() const; + /** True while audio is being pre-buffered (playback hasn't started yet). + * Used by the LipSync component to pause viseme queue consumption. */ + UFUNCTION(BlueprintPure, Category = "ElevenLabs") + bool IsPreBuffering() const { return bPreBuffering; } + /** Access the underlying WebSocket proxy (advanced use). 
*/ UFUNCTION(BlueprintPure, Category = "ElevenLabs") UElevenLabsWebSocketProxy* GetWebSocketProxy() const { return WebSocketProxy; } @@ -353,6 +369,14 @@ private: TArray AudioQueue; FCriticalSection AudioQueueLock; + // Reusable zero-filled buffer fed to USoundWaveProcedural during TTS gaps + // to keep the audio component alive (prevents stop on buffer underrun). + TArray SilenceBuffer; + + // Pre-buffer state: delay playback start to absorb TTS inter-chunk gaps. + bool bPreBuffering = false; + double PreBufferStartTime = 0.0; + // Silence detection: how many consecutive ticks with an empty audio queue. int32 SilentTickCount = 0; diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsLipSyncComponent.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsLipSyncComponent.h index 9a98e1d..abaf230 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsLipSyncComponent.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsLipSyncComponent.h @@ -51,11 +51,19 @@ public: ToolTip = "Lip sync intensity.\n1.0 = normal, higher = more expressive, lower = subtler.")) float LipSyncStrength = 1.0f; + /** Scales the audio amplitude driving mouth movement. + * Lower values produce subtler animation, higher values are more pronounced. + * Use this to tone down overly strong lip movement without changing the shape. */ + UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync", + meta = (ClampMin = "0.5", ClampMax = "1.0", + ToolTip = "Audio amplitude scale.\n0.5 = subtle, 0.75 = balanced, 1.0 = full.\nReduces overall mouth movement without affecting viseme shape.")) + float AmplitudeScale = 0.75f; + /** How quickly viseme weights interpolate towards new values each frame. 
*/ UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync", - meta = (ClampMin = "1.0", ClampMax = "100.0", - ToolTip = "Smoothing speed for viseme transitions.\nLower = smoother but laggy, higher = responsive but jittery.\n15-25 is usually good.")) - float SmoothingSpeed = 20.0f; + meta = (ClampMin = "35.0", ClampMax = "65.0", + ToolTip = "Smoothing speed for viseme transitions.\n35 = smooth and soft, 50 = balanced, 65 = sharp and responsive.")) + float SmoothingSpeed = 50.0f; // ── Events ──────────────────────────────────────────────────────────────── @@ -87,6 +95,20 @@ private: /** Receives raw PCM from the agent component. */ void OnAudioChunkReceived(const TArray& PCMData); + /** Receives full text response from the agent component. */ + UFUNCTION() + void OnTextResponseReceived(const FString& ResponseText); + + /** Receives partial text streaming from the agent component. */ + UFUNCTION() + void OnPartialTextReceived(const FString& PartialText); + + /** Convert text to a sequence of OVR viseme names (grapheme-to-phoneme-to-viseme). */ + void ConvertTextToVisemes(const FString& Text); + + /** Apply text-derived viseme shapes to the remaining queued frames. */ + void ApplyTextVisemesToQueue(); + /** Extract frequency band energies from the spectrum analyzer. */ void AnalyzeSpectrum(); @@ -122,6 +144,13 @@ private: // ARKit blendshape weights derived from SmoothedVisemes (exposed via GetCurrentBlendshapes) TMap CurrentBlendshapes; + // Previous frame's blendshape values for additional output smoothing + TMap PreviousBlendshapes; + + // Last consumed queue frame — used for inter-frame interpolation + // to create continuous motion instead of 32ms step-wise jumps + TMap LastConsumedVisemes; + // MetaHuman mode: Face mesh has no morph targets, use animation curves instead. // Set automatically in BeginPlay when TargetMesh has 0 morph targets. 
bool bUseCurveMode = false; @@ -129,9 +158,48 @@ private: // Cache of ARKit→MetaHuman curve name conversions to avoid per-frame string ops. TMap CurveNameCache; + // RMS amplitude from the latest audio chunk (0-1 range, drives jaw opening) + float CurrentAmplitude = 0.0f; + + // ── Viseme queue ────────────────────────────────────────────────────────── + + // Queue of per-window viseme analysis results. + // OnAudioChunkReceived builds one frame per 512-sample window (~32ms). + // TickComponent consumes them at the correct playback rate. + TArray> VisemeQueue; + + // Parallel queue of per-window amplitude values (for text-driven shape replacement) + TArray AmplitudeQueue; + + // Timer for consuming queued viseme frames at the FFT window rate + float PlaybackTimer = 0.0f; + // Whether we have pending analysis results to process bool bHasPendingAnalysis = false; + // ── Text-driven lip sync ────────────────────────────────────────────────── + + // Accumulated partial text from streaming (agent_chat_response_part events). + // Built up token-by-token before the audio arrives. + FString AccumulatedText; + + // Ordered sequence of OVR viseme names derived from text. + // E.g. "Bonjour" → [PP, oh, nn, CH, ou, RR] + TArray TextVisemeSequence; + + // Whether text-based visemes have been applied to the current queue + bool bTextVisemesApplied = false; + + // Set when agent_response arrives (full text for this utterance). + // Prevents resetting AccumulatedText between audio chunks of the + // SAME utterance — only reset once the full response is confirmed. + bool bFullTextReceived = false; + + // Wait-for-text mechanism: when audio arrives without text, hold playback + // until text arrives (partial or full) so all frames get proper text visemes. + bool bWaitingForText = false; + double WaitingForTextStartTime = 0.0; + // Cached reference to the agent component on the same Actor TWeakObjectPtr AgentComponent; FDelegateHandle AudioDataHandle;