From 5f0bd6e9f052821ab8d0d8394b9df4467176b6a0 Mon Sep 17 00:00:00 2001
From: KiritoDv <kiritodev01@gmail.com>
Date: Sun, 2 Feb 2025 23:34:11 -0600
Subject: [PATCH 1/6] First implementation of louist system

---
 .github/workflows/linux.yml                   |   2 +-
 .github/workflows/mac.yml                     |   2 +-
 .github/workflows/main.yml                    |   4 +-
 CMakeLists.txt                                |  26 +-
 assets/yaml/cn/rev0/ast_audio.yaml            |  31 +++
 cmake/modules/FindOgg.cmake                   |  61 +++++
 cmake/modules/FindVorbis.cmake                | 197 ++++++++++++++
 docs/BUILDING.md                              |  18 +-
 src/audio/audio_synthesis.c                   |   3 +-
 src/port/Engine.cpp                           |   5 +-
 .../importers/audio/SampleFactory.cpp         | 244 ++++++++++++++++--
 .../resource/importers/audio/SampleFactory.h  |  12 +-
 tools/Torch                                   |   2 +-
 13 files changed, 567 insertions(+), 40 deletions(-)
 create mode 100644 assets/yaml/cn/rev0/ast_audio.yaml
 create mode 100644 cmake/modules/FindOgg.cmake
 create mode 100644 cmake/modules/FindVorbis.cmake

diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 251ddcc8..c0b68d29 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -14,7 +14,7 @@ jobs:
       - name: Update machine
         run: sudo apt update
       - name: Install dependencies
-        run: sudo apt-get install gcc g++ git cmake ninja-build lsb-release libsdl2-dev libpng-dev libsdl2-net-dev libzip-dev zipcmp zipmerge ziptool nlohmann-json3-dev libtinyxml2-dev libspdlog-dev libboost-dev libopengl-dev
+        run: sudo apt-get install gcc g++ git cmake ninja-build lsb-release libsdl2-dev libpng-dev libsdl2-net-dev libzip-dev zipcmp zipmerge ziptool nlohmann-json3-dev libtinyxml2-dev libspdlog-dev libboost-dev libopengl-dev libogg-dev libvorbis-dev
       - name: Install latest SDL
         run: |
           export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH"
diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
index 2cc8c768..f70c52e2 100644
--- a/.github/workflows/mac.yml
+++ b/.github/workflows/mac.yml
@@ -12,7 +12,7 @@ jobs:
         with:
           submodules: recursive
       - name: Install dependencies
-        run: brew install sdl2 libpng glew ninja cmake libzip nlohmann-json tinyxml2 spdlog
+        run: brew install sdl2 libpng glew ninja cmake libzip nlohmann-json tinyxml2 spdlog vorbis-tools
       - name: Build
         run: |
           cmake -H. -Bbuild-cmake -GNinja -DCMAKE_BUILD_TYPE=Release
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 160e17e2..54ac983f 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -81,7 +81,7 @@ jobs:
         with:
           submodules: recursive
       - name: Install dependencies
-        run: brew install sdl2 libpng glew ninja cmake libzip nlohmann-json tinyxml2 spdlog
+        run: brew install sdl2 libpng glew ninja cmake libzip nlohmann-json tinyxml2 spdlog vorbis-tools
       - name: Build
         run: |
           cmake -H. -Bbuild-cmake -GNinja -DCMAKE_BUILD_TYPE=Release
@@ -115,7 +115,7 @@ jobs:
       - name: Update machine
         run: sudo apt update
       - name: Install dependencies
-        run: sudo apt-get install gcc g++ git cmake ninja-build lsb-release libsdl2-dev libpng-dev libsdl2-net-dev libzip-dev zipcmp zipmerge ziptool nlohmann-json3-dev libtinyxml2-dev libspdlog-dev libboost-dev libopengl-dev
+        run: sudo apt-get install gcc g++ git cmake ninja-build lsb-release libsdl2-dev libpng-dev libsdl2-net-dev libzip-dev zipcmp zipmerge ziptool nlohmann-json3-dev libtinyxml2-dev libspdlog-dev libboost-dev libopengl-dev libogg-dev libvorbis-dev
       - name: ccache
         uses: hendrikmuhs/ccache-action@v1.2.14
         with:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 557a23c4..ffe5a58f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,6 +2,7 @@ cmake_minimum_required(VERSION 3.16.0 FATAL_ERROR)
 
 # Set the project version and language
 project(Starship VERSION 0.1.0 LANGUAGES C CXX ASM)
+include(FetchContent)
 
 if(APPLE)
   enable_language(OBJCXX)
@@ -26,7 +27,7 @@ include(cmake/automate-vcpkg.cmake)
 set(VCPKG_TRIPLET x64-windows-static)
 set(VCPKG_TARGET_TRIPLET x64-windows-static)
 vcpkg_bootstrap()
-vcpkg_install_packages(zlib bzip2 libzip libpng sdl2 glew glfw3 nlohmann-json tinyxml2 spdlog)
+vcpkg_install_packages(zlib bzip2 libzip libpng sdl2 glew glfw3 nlohmann-json tinyxml2 spdlog libogg libvorbis)
 
 set_property(DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT ${PROJECT_NAME})
 set_property(DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTY VS_DEBUGGER_WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
@@ -188,6 +189,13 @@ if (MSVC)
   endif()
 endif()
 
+FetchContent_Declare(
+    dr_libs
+    GIT_REPOSITORY https://github.com/mackron/dr_libs.git
+    GIT_TAG da35f9d6c7374a95353fd1df1d394d44ab66cf01 # pinned commit
+)
+FetchContent_MakeAvailable(dr_libs)
+
 #==============================================================================#
 # Libultraship Integration                                                     #
 #==============================================================================#
@@ -224,6 +232,7 @@ include_directories(
   ${CMAKE_CURRENT_SOURCE_DIR}/libultraship/src/graphic
   ${SDL2_INCLUDE_DIRS}
   ${GLEW_INCLUDE_DIRS}
+  ${dr_libs_SOURCE_DIR}
 )
 
 add_subdirectory(libultraship ${CMAKE_CURRENT_SOURCE_DIR}/libultraship)
@@ -280,8 +289,17 @@ endif()
 
 
 if (CMAKE_SYSTEM_NAME STREQUAL "Windows")
+    find_package(Ogg CONFIG REQUIRED)
+    link_libraries(Ogg::ogg)
+
+    find_package(Vorbis CONFIG REQUIRED)
+    link_libraries(Vorbis::vorbisfile)
 	set(ADDITIONAL_LIBRARY_DEPENDENCIES
         "$<$<BOOL:${USE_NETWORKING}>:SDL2_net::SDL2_net-static>"
+        "Ogg::ogg"
+        "Vorbis::vorbis"
+        "Vorbis::vorbisenc"
+        "Vorbis::vorbisfile"
     )
 elseif(CMAKE_SYSTEM_NAME STREQUAL "NintendoSwitch")
     set(ADDITIONAL_LIBRARY_DEPENDENCIES
@@ -295,8 +313,14 @@ elseif(CMAKE_SYSTEM_NAME STREQUAL "CafeOS")
         ${DEVKITPRO}/portlibs/wiiu/include/
     )
 else()
+    find_package(Ogg REQUIRED)
+    find_package(Vorbis REQUIRED)
  	set(ADDITIONAL_LIBRARY_DEPENDENCIES
         "$<$<BOOL:${USE_NETWORKING}>:SDL2_net::SDL2_net>"
+        "Ogg::ogg"
+        "Vorbis::vorbis"
+        "Vorbis::vorbisenc"
+        "Vorbis::vorbisfile"
 	)
 endif()
 
diff --git a/assets/yaml/cn/rev0/ast_audio.yaml b/assets/yaml/cn/rev0/ast_audio.yaml
new file mode 100644
index 00000000..e6bb775a
--- /dev/null
+++ b/assets/yaml/cn/rev0/ast_audio.yaml
@@ -0,0 +1,31 @@
+:config:
+  force: true
+  header:
+    code:
+      - '#include "sys.h"'
+      - '#include "sf64audio_provisional.h"'
+
+audio_setup:
+  type: NAUDIO:V1:AUDIO_SETUP
+  driver: SF64
+  audio_seq:
+    size: 0x3AFD0
+    offset: 0xE9950
+  audio_bank:
+    size: 0x1CB20
+    offset: 0x1183A0
+  audio_table:
+    size: 0x691AF0
+    offset: 0x134EC0
+
+audio_sample_bank_table:
+  { type: NAUDIO:V1:AUDIO_TABLE, format: SAMPLE, offset: 0xC1460, symbol: gSampleBankTableInit }
+
+audio_seq_table:
+  { type: NAUDIO:V1:AUDIO_TABLE, format: SEQUENCE, offset: 0xC14A0, symbol: gSeqTableInit }
+
+audio_soundfont_table:
+  { type: NAUDIO:V1:AUDIO_TABLE, format: SOUNDFONT, offset: 0xC18D0, symbol: gSoundFontTableInit }
+
+audio_seq_font_table:
+  { type: ARRAY, count: 283, array_type: u8, offset: 0xC1AF0, symbol: gSeqFontTableInit }
\ No newline at end of file
diff --git a/cmake/modules/FindOgg.cmake b/cmake/modules/FindOgg.cmake
new file mode 100644
index 00000000..f606144f
--- /dev/null
+++ b/cmake/modules/FindOgg.cmake
@@ -0,0 +1,61 @@
+# - Find ogg
+# Find the native ogg includes and libraries
+#
+#  OGG_INCLUDE_DIRS - where to find ogg.h, etc.
+#  OGG_LIBRARIES    - List of libraries when using ogg.
+#  OGG_FOUND        - True if ogg found.
+
+if (OGG_INCLUDE_DIR)
+	# Already in cache, be silent
+	set(OGG_FIND_QUIETLY TRUE)
+endif ()
+
+find_package (PkgConfig QUIET)
+pkg_check_modules (PC_OGG QUIET ogg>=1.3.0)
+
+set (OGG_VERSION ${PC_OGG_VERSION})
+
+find_path (OGG_INCLUDE_DIR ogg/ogg.h
+	HINTS
+		${PC_OGG_INCLUDEDIR}
+		${PC_OGG_INCLUDE_DIRS}
+		${OGG_ROOT}
+	)
+# MSVC built ogg may be named ogg_static.
+# The provided project files name the library with the lib prefix.
+find_library (OGG_LIBRARY
+	NAMES
+		ogg
+		ogg_static
+		libogg
+		libogg_static
+	HINTS
+		${PC_OGG_LIBDIR}
+		${PC_OGG_LIBRARY_DIRS}
+		${OGG_ROOT}
+	)
+# Handle the QUIETLY and REQUIRED arguments and set OGG_FOUND
+# to TRUE if all listed variables are TRUE.
+include (FindPackageHandleStandardArgs)
+find_package_handle_standard_args (Ogg
+	REQUIRED_VARS
+		OGG_LIBRARY
+		OGG_INCLUDE_DIR
+	VERSION_VAR
+		OGG_VERSION
+	)
+
+if (OGG_FOUND)
+	set (OGG_LIBRARIES ${OGG_LIBRARY})
+	set (OGG_INCLUDE_DIRS ${OGG_INCLUDE_DIR})
+
+	if(NOT TARGET Ogg::ogg) # define the imported target only once
+		add_library(Ogg::ogg UNKNOWN IMPORTED)
+		set_target_properties(Ogg::ogg PROPERTIES
+			INTERFACE_INCLUDE_DIRECTORIES "${OGG_INCLUDE_DIRS}"
+			IMPORTED_LOCATION "${OGG_LIBRARIES}"
+		)
+	endif ()
+endif ()
+
+mark_as_advanced (OGG_INCLUDE_DIR OGG_LIBRARY)
\ No newline at end of file
diff --git a/cmake/modules/FindVorbis.cmake b/cmake/modules/FindVorbis.cmake
new file mode 100644
index 00000000..0d3d6624
--- /dev/null
+++ b/cmake/modules/FindVorbis.cmake
@@ -0,0 +1,197 @@
+#[=======================================================================[.rst:
+FindVorbis
+----------
+Finds the native vorbis, vorbisenc and vorbisfile includes and libraries.
+Imported Targets
+^^^^^^^^^^^^^^^^
+This module provides the following imported targets, if found:
+``Vorbis::vorbis``
+  The Vorbis library
+``Vorbis::vorbisenc``
+  The VorbisEnc library
+``Vorbis::vorbisfile``
+  The VorbisFile library
+Result Variables
+^^^^^^^^^^^^^^^^
+This will define the following variables:
+``Vorbis_Vorbis_INCLUDE_DIRS``
+  List of include directories when using vorbis.
+``Vorbis_Enc_INCLUDE_DIRS``
+  List of include directories when using vorbisenc.
+``Vorbis_File_INCLUDE_DIRS``
+  List of include directories when using vorbisfile.
+``Vorbis_Vorbis_LIBRARIES``
+  List of libraries when using vorbis.
+``Vorbis_Enc_LIBRARIES``
+  List of libraries when using vorbisenc.
+``Vorbis_File_LIBRARIES``
+  List of libraries when using vorbisfile.
+``Vorbis_FOUND``
+  True if vorbis and requested components found.
+``Vorbis_Vorbis_FOUND``
+  True if vorbis found.
+``Vorbis_Enc_FOUND``
+  True if vorbisenc found.
+``Vorbis_File_FOUND``
+  True if vorbisfile found.
+Cache variables
+^^^^^^^^^^^^^^^
+The following cache variables may also be set:
+``Vorbis_Vorbis_INCLUDE_DIR``
+  The directory containing ``vorbis/codec.h``.
+``Vorbis_Enc_INCLUDE_DIR``
+  The directory containing ``vorbis/vorbisenc.h``.
+``Vorbis_File_INCLUDE_DIR``
+  The directory containing ``vorbis/vorbisfile.h``.
+``Vorbis_Vorbis_LIBRARY``
+  The path to the vorbis library.
+``Vorbis_Enc_LIBRARY``
+  The path to the vorbisenc library.
+``Vorbis_File_LIBRARY``
+  The path to the vorbisfile library.
+Hints
+^^^^^
+A user may set ``Vorbis_ROOT`` to a vorbis installation root to tell this module where to look.
+#]=======================================================================]
+
+if (Vorbis_Vorbis_INCLUDE_DIR)
+	# Already in cache, be silent
+	set (Vorbis_FIND_QUIETLY TRUE)
+endif ()
+
+set (Vorbis_Vorbis_FIND_QUIETLY TRUE)
+set (Vorbis_Enc_FIND_QUIETLY TRUE)
+set (Vorbis_File_FIND_QUIETLY TRUE)
+
+find_package (Ogg QUIET)
+
+find_package (PkgConfig QUIET)
+pkg_check_modules (PC_Vorbis_Vorbis QUIET vorbis)
+pkg_check_modules (PC_Vorbis_Enc QUIET vorbisenc)
+pkg_check_modules (PC_Vorbis_File QUIET vorbisfile)
+
+set (Vorbis_VERSION ${PC_Vorbis_Vorbis_VERSION})
+
+find_path (Vorbis_Vorbis_INCLUDE_DIR vorbis/codec.h
+	HINTS
+		${PC_Vorbis_Vorbis_INCLUDEDIR}
+		${PC_Vorbis_Vorbis_INCLUDE_DIRS}
+		${Vorbis_ROOT}
+	)
+
+find_path (Vorbis_Enc_INCLUDE_DIR vorbis/vorbisenc.h
+	HINTS
+		${PC_Vorbis_Enc_INCLUDEDIR}
+		${PC_Vorbis_Enc_INCLUDE_DIRS}
+		${Vorbis_ROOT}
+	)
+
+find_path (Vorbis_File_INCLUDE_DIR vorbis/vorbisfile.h
+	HINTS
+		${PC_Vorbis_File_INCLUDEDIR}
+		${PC_Vorbis_File_INCLUDE_DIRS}
+		${Vorbis_ROOT}
+	)
+
+find_library (Vorbis_Vorbis_LIBRARY
+	NAMES
+		vorbis
+		vorbis_static
+		libvorbis
+		libvorbis_static
+	HINTS
+		${PC_Vorbis_Vorbis_LIBDIR}
+		${PC_Vorbis_Vorbis_LIBRARY_DIRS}
+		${Vorbis_ROOT}
+	)
+
+find_library (Vorbis_Enc_LIBRARY
+	NAMES
+		vorbisenc
+		vorbisenc_static
+		libvorbisenc
+		libvorbisenc_static
+	HINTS
+		${PC_Vorbis_Enc_LIBDIR}
+		${PC_Vorbis_Enc_LIBRARY_DIRS}
+		${Vorbis_ROOT}
+	)
+
+find_library (Vorbis_File_LIBRARY
+	NAMES
+		vorbisfile
+		vorbisfile_static
+		libvorbisfile
+		libvorbisfile_static
+	HINTS
+		${PC_Vorbis_File_LIBDIR}
+		${PC_Vorbis_File_LIBRARY_DIRS}
+		${Vorbis_ROOT}
+	)
+
+include (FindPackageHandleStandardArgs)
+
+if (Vorbis_Vorbis_LIBRARY AND Vorbis_Vorbis_INCLUDE_DIR AND Ogg_FOUND)
+    set (Vorbis_Vorbis_FOUND TRUE)
+endif ()
+
+if (Vorbis_Enc_LIBRARY AND Vorbis_Enc_INCLUDE_DIR AND Vorbis_Vorbis_FOUND)
+    set (Vorbis_Enc_FOUND TRUE)
+endif ()
+
+if (Vorbis_Vorbis_FOUND AND Vorbis_File_LIBRARY AND Vorbis_File_INCLUDE_DIR)
+    set (Vorbis_File_FOUND TRUE)
+endif ()
+
+find_package_handle_standard_args (Vorbis
+	REQUIRED_VARS
+		Vorbis_Vorbis_LIBRARY
+		Vorbis_Vorbis_INCLUDE_DIR
+		Ogg_FOUND
+	HANDLE_COMPONENTS
+	VERSION_VAR Vorbis_VERSION)
+
+
+if (Vorbis_Vorbis_FOUND)
+	set (Vorbis_Vorbis_INCLUDE_DIRS ${Vorbis_Vorbis_INCLUDE_DIR}) # result of find_path(vorbis/codec.h) above
+	set (Vorbis_Vorbis_LIBRARIES ${Vorbis_Vorbis_LIBRARY} ${OGG_LIBRARIES}) # vorbis library plus the ogg libraries
+    if (NOT TARGET Vorbis::vorbis)
+		add_library (Vorbis::vorbis UNKNOWN IMPORTED)
+		set_target_properties (Vorbis::vorbis PROPERTIES
+			INTERFACE_INCLUDE_DIRECTORIES "${Vorbis_Vorbis_INCLUDE_DIR}"
+			IMPORTED_LOCATION "${Vorbis_Vorbis_LIBRARY}"
+			INTERFACE_LINK_LIBRARIES Ogg::ogg
+		)
+	endif ()
+
+	if (Vorbis_Enc_FOUND)
+		set (Vorbis_Enc_INCLUDE_DIRS ${Vorbis_Enc_INCLUDE_DIR})
+		set (Vorbis_Enc_LIBRARIES ${Vorbis_Enc_LIBRARY} ${Vorbis_Vorbis_LIBRARIES}) # vorbisenc plus the vorbis library list
+		if (NOT TARGET Vorbis::vorbisenc)
+			add_library (Vorbis::vorbisenc UNKNOWN IMPORTED)
+			set_target_properties (Vorbis::vorbisenc PROPERTIES
+				INTERFACE_INCLUDE_DIRECTORIES "${Vorbis_Enc_INCLUDE_DIR}"
+				IMPORTED_LOCATION "${Vorbis_Enc_LIBRARY}"
+				INTERFACE_LINK_LIBRARIES Vorbis::vorbis
+			)
+		endif ()
+	endif ()
+
+	if (Vorbis_File_FOUND)
+		set (Vorbis_File_INCLUDE_DIRS ${Vorbis_File_INCLUDE_DIR})
+		set (Vorbis_File_LIBRARIES ${Vorbis_File_LIBRARY} ${Vorbis_Vorbis_LIBRARIES}) # vorbisfile plus the vorbis library list
+		if (NOT TARGET Vorbis::vorbisfile)
+			add_library (Vorbis::vorbisfile UNKNOWN IMPORTED)
+			set_target_properties (Vorbis::vorbisfile PROPERTIES
+				INTERFACE_INCLUDE_DIRECTORIES "${Vorbis_File_INCLUDE_DIR}"
+				IMPORTED_LOCATION "${Vorbis_File_LIBRARY}"
+				INTERFACE_LINK_LIBRARIES Vorbis::vorbis
+			)
+		endif ()
+	endif ()
+
+endif ()
+
+mark_as_advanced (Vorbis_Vorbis_INCLUDE_DIR Vorbis_Vorbis_LIBRARY)
+mark_as_advanced (Vorbis_Enc_INCLUDE_DIR Vorbis_Enc_LIBRARY)
+mark_as_advanced (Vorbis_File_INCLUDE_DIR Vorbis_File_LIBRARY)
\ No newline at end of file
diff --git a/docs/BUILDING.md b/docs/BUILDING.md
index d042a25c..aac581e6 100644
--- a/docs/BUILDING.md
+++ b/docs/BUILDING.md
@@ -83,34 +83,34 @@ C:\Program Files\CMake\bin\cmake.exe --build build-cmake --target clean
 #### Debian/Ubuntu
 ```sh
 # using gcc
-apt-get install gcc g++ git cmake ninja-build lsb-release libsdl2-dev libpng-dev libsdl2-net-dev libzip-dev zipcmp zipmerge ziptool nlohmann-json3-dev libtinyxml2-dev libspdlog-dev libboost-dev libopengl-dev
+apt-get install gcc g++ git cmake ninja-build lsb-release libsdl2-dev libpng-dev libsdl2-net-dev libzip-dev zipcmp zipmerge ziptool nlohmann-json3-dev libtinyxml2-dev libspdlog-dev libboost-dev libopengl-dev libogg-dev libvorbis-dev
 
 # or using clang
-apt-get install clang git cmake ninja-build lsb-release libsdl2-dev libpng-dev libsdl2-net-dev libzip-dev zipcmp zipmerge ziptool nlohmann-json3-dev libtinyxml2-dev libspdlog-dev libboost-dev libopengl-dev
+apt-get install clang git cmake ninja-build lsb-release libsdl2-dev libpng-dev libsdl2-net-dev libzip-dev zipcmp zipmerge ziptool nlohmann-json3-dev libtinyxml2-dev libspdlog-dev libboost-dev libopengl-dev libogg-dev libvorbis-dev
 ```
 #### Arch
 ```sh
 # using gcc
-pacman -S gcc git cmake ninja lsb-release sdl2 libpng libzip nlohmann-json tinyxml2 spdlog sdl2_net boost
+pacman -S gcc git cmake ninja lsb-release sdl2 libpng libzip nlohmann-json tinyxml2 spdlog sdl2_net boost libogg libvorbis
 
 # or using clang
-pacman -S clang git cmake ninja lsb-release sdl2 libpng libzip nlohmann-json tinyxml2 spdlog sdl2_net boost
+pacman -S clang git cmake ninja lsb-release sdl2 libpng libzip nlohmann-json tinyxml2 spdlog sdl2_net boost libogg libvorbis
 ```
 #### Fedora
 ```sh
 # using gcc
-dnf install gcc gcc-c++ git cmake ninja-build lsb_release SDL2-devel libpng-devel libzip-devel libzip-tools nlohmann-json-devel tinyxml2-devel spdlog-devel boost-devel
+dnf install gcc gcc-c++ git cmake ninja-build lsb_release SDL2-devel libpng-devel libzip-devel libzip-tools nlohmann-json-devel tinyxml2-devel spdlog-devel boost-devel libogg-devel libvorbis-devel
 
 # or using clang
-dnf install clang git cmake ninja-build lsb_release SDL2-devel libpng-devel libzip-devel libzip-tools nlohmann-json-devel tinyxml2-devel spdlog-devel boost-devel
+dnf install clang git cmake ninja-build lsb_release SDL2-devel libpng-devel libzip-devel libzip-tools nlohmann-json-devel tinyxml2-devel spdlog-devel boost-devel libogg-devel libvorbis-devel
 ```
 #### openSUSE
 ```sh
 # using gcc
-zypper in gcc gcc-c++ git cmake ninja SDL2-devel libpng16-devel libzip-devel libzip-tools nlohmann_json-devel tinyxml2-devel spdlog-devel
+zypper in gcc gcc-c++ git cmake ninja SDL2-devel libpng16-devel libzip-devel libzip-tools nlohmann_json-devel tinyxml2-devel spdlog-devel libogg-devel libvorbis-devel
 
 # or using clang
-zypper in clang libstdc++-devel git cmake ninja SDL2-devel libpng16-devel libzip-devel libzip-tools nlohmann_json-devel tinyxml2-devel spdlog-devel
+zypper in clang libstdc++-devel git cmake ninja SDL2-devel libpng16-devel libzip-devel libzip-tools nlohmann_json-devel tinyxml2-devel spdlog-devel libogg-devel libvorbis-devel
 ```
 
 ### Build
@@ -160,7 +160,7 @@ cmake --build build-cmake --target clean
 ```
 
 ## macOS
-Requires Xcode (or xcode-tools) && `sdl2, libpng, glew, ninja, cmake, nlohmann-json, libzip` (can be installed via homebrew, macports, etc)
+Requires Xcode (or xcode-tools) && `sdl2, libpng, glew, ninja, cmake, nlohmann-json, libzip, vorbis-tools` (can be installed via homebrew, macports, etc)
 
 **Important: For maximum performance make sure you have ninja build tools installed!**
 
diff --git a/src/audio/audio_synthesis.c b/src/audio/audio_synthesis.c
index 090e5128..f360229a 100644
--- a/src/audio/audio_synthesis.c
+++ b/src/audio/audio_synthesis.c
@@ -1036,13 +1036,14 @@ Acmd* AudioSynth_ProcessNote(s32 noteIndex, NoteSubEu* noteSub, NoteSynthesisSta
                         goto skip;
 
                     case CODEC_S16:
+                        flags = A_CONTINUE;
                         skipBytes = 0;
                         size_t bytesToRead;
                         numSamplesProcessed += numSamplesToLoadAdj;
                         dmemUncompressedAddrOffset1 = numSamplesToLoadAdj;
 
                         if (((synthState->samplePosInt * 2) + (numSamplesToLoadAdj)*SAMPLE_SIZE) < bookSample->size) {
-                            bytesToRead = (numSamplesToLoadAdj)*SAMPLE_SIZE;
+                            bytesToRead = (numSamplesToLoadAdj) * SAMPLE_SIZE;
                         } else {
                             bytesToRead = bookSample->size - (synthState->samplePosInt * 2);
                         }
diff --git a/src/port/Engine.cpp b/src/port/Engine.cpp
index 98492a67..aaebe67a 100644
--- a/src/port/Engine.cpp
+++ b/src/port/Engine.cpp
@@ -237,8 +237,9 @@ GameEngine::GameEngine() {
 
     loader->RegisterResourceFactory(std::make_shared<SF64::ResourceFactoryBinarySampleV1>(), RESOURCE_FORMAT_BINARY,
                                     "Sample", static_cast<uint32_t>(SF64::ResourceType::Sample), 1);
-    loader->RegisterResourceFactory(std::make_shared<SF64::ResourceFactoryBinarySampleV2>(), RESOURCE_FORMAT_BINARY,
-                                    "Sample", static_cast<uint32_t>(SF64::ResourceType::Sample), 2);
+    
+    loader->RegisterResourceFactory(std::make_shared<SF64::ResourceFactoryXMLSampleV0>(), RESOURCE_FORMAT_XML,
+                                    "Sample", static_cast<uint32_t>(SF64::ResourceType::Sample), 0);
 
     loader->RegisterResourceFactory(std::make_shared<SF64::ResourceFactoryBinarySoundFontV0>(), RESOURCE_FORMAT_BINARY,
                                     "SoundFont", static_cast<uint32_t>(SF64::ResourceType::SoundFont), 0);
diff --git a/src/port/resource/importers/audio/SampleFactory.cpp b/src/port/resource/importers/audio/SampleFactory.cpp
index cac380d2..8289266a 100644
--- a/src/port/resource/importers/audio/SampleFactory.cpp
+++ b/src/port/resource/importers/audio/SampleFactory.cpp
@@ -1,6 +1,14 @@
 #include "SampleFactory.h"
 #include "../ResourceUtil.h"
 #include "port/resource/type/audio/Sample.h"
+#include "sf64audio_provisional.h"
+#define DR_WAV_IMPLEMENTATION
+#include <dr_wav.h>
+
+#define DR_MP3_IMPLEMENTATION
+#include <dr_mp3.h>
+
+#include "vorbis/vorbisfile.h"
 
 namespace SF64 {
 std::shared_ptr<Ship::IResource> ResourceFactoryBinarySampleV1::ReadResource(std::shared_ptr<Ship::File> file) {
@@ -23,7 +31,7 @@ std::shared_ptr<Ship::IResource> ResourceFactoryBinarySampleV1::ReadResource(std
     if(sample->mSample.codec == 2){
         sample->mSample.medium = 2;
         for(size_t i = 0; i < sample->mSample.size / 2; i++){
-            int16_t* sampleData = (int16_t*) sample->mSample.sampleAddr;
+            auto sampleData = (int16_t*) sample->mSample.sampleAddr;
             sampleData[i] = BSWAP16(sampleData[i]);
         }
     } else {
@@ -35,36 +43,230 @@ std::shared_ptr<Ship::IResource> ResourceFactoryBinarySampleV1::ReadResource(std
     return sample;
 }
 
-std::shared_ptr<Ship::IResource> ResourceFactoryBinarySampleV2::ReadResource(std::shared_ptr<Ship::File> file) {
+static size_t VorbisReadCallback(void* out, size_t size, size_t elems, void* src) {
+    OggFileData* data = static_cast<OggFileData*>(src);
+    size_t toRead = size * elems;
+
+    if (toRead > data->size - data->pos) {
+        toRead = data->size - data->pos;
+    }
+
+    memcpy(out, static_cast<uint8_t*>(data->data) + data->pos, toRead);
+    data->pos += toRead;
+
+    return toRead / size;
+}
+
+static int VorbisSeekCallback(void* src, ogg_int64_t pos, int whence) {
+    OggFileData* data = static_cast<OggFileData*>(src);
+    size_t newPos;
+
+    switch (whence) {
+        case SEEK_SET:
+            newPos = pos;
+            break;
+        case SEEK_CUR:
+            newPos = data->pos + pos;
+            break;
+        case SEEK_END:
+            newPos = data->size + pos;
+            break;
+        default:
+            return -1;
+    }
+    if (newPos > data->size) {
+        return -1;
+    }
+    data->pos = newPos;
+    return 0;
+}
+
+static int VorbisCloseCallback([[maybe_unused]] void* src) {
+    return 0;
+}
+
+static long VorbisTellCallback(void* src) {
+    OggFileData* data = static_cast<OggFileData*>(src);
+    return data->pos;
+}
+
+static const ov_callbacks vorbisCallbacks = {
+    VorbisReadCallback,
+    VorbisSeekCallback,
+    VorbisCloseCallback,
+    VorbisTellCallback,
+};
+
+static void Mp3DecoderWorker(std::shared_ptr<Sample> sample, std::shared_ptr<Ship::File> sampleFile) {
+    drmp3 mp3;
+    drwav_uint64 numFrames;
+    drmp3_bool32 ret =
+        drmp3_init_memory(&mp3, sampleFile->Buffer.get()->data(), sampleFile->Buffer.get()->size(), nullptr);
+    numFrames = drmp3_get_pcm_frame_count(&mp3);
+    drwav_uint64 channels = mp3.channels;
+    drwav_uint64 sampleRate = mp3.sampleRate;
+
+    sample->mSample.sampleAddr = new uint8_t[numFrames * channels * 2];
+    drmp3_read_pcm_frames_s16(&mp3, numFrames, (int16_t*)sample->mSample.sampleAddr);
+}
+
+static void OggDecoderWorker(std::shared_ptr<Sample> sample, std::shared_ptr<Ship::File> sampleFile) {
+    OggVorbis_File vf;
+    char dataBuff[4096];
+    long read = 0;
+    size_t pos = 0;
+
+    OggFileData fileData = {
+        .data = sampleFile->Buffer.get()->data(),
+        .pos = 0,
+        .size = sampleFile->Buffer.get()->size(),
+    };
+    int ret = ov_open_callbacks(&fileData, &vf, nullptr, 0, vorbisCallbacks);
+
+    vorbis_info* vi = ov_info(&vf, -1);
+
+    uint64_t numFrames = ov_pcm_total(&vf, -1);
+    uint64_t sampleRate = vi->rate;
+    uint64_t numChannels = vi->channels;
+    int bitStream = 0;
+    size_t toRead = numFrames * numChannels * 2;
+    sample->mSample.sampleAddr = new uint8_t[toRead];
+    do {
+        read = ov_read(&vf, dataBuff, 4096, 0, 2, 1, &bitStream);
+        memcpy(sample->mSample.sampleAddr + pos, dataBuff, read);
+        pos += read;
+    } while (read != 0);
+    ov_clear(&vf);
+}
+
+std::shared_ptr<Ship::IResource> ResourceFactoryXMLSampleV0::ReadResource(std::shared_ptr<Ship::File> file) {
     if (!FileHasValidFormatAndReader(file)) {
         return nullptr;
     }
 
     auto sample = std::make_shared<Sample>(file->InitData);
-    auto reader = std::get<std::shared_ptr<Ship::BinaryReader>>(file->Reader);
+    auto child = std::get<std::shared_ptr<tinyxml2::XMLDocument>>(file->Reader)->FirstChildElement();
+    std::shared_ptr<Ship::ResourceInitData> initData = std::make_shared<Ship::ResourceInitData>();
+    const char* customFormatStr = child->Attribute("CustomFormat");
+    memset(&sample->mSample, 0, sizeof(sample->mSample));
+    sample->mSample.isRelocated = 0;
+    sample->mSample.codec = CodecStrToInt(child->Attribute("Codec"), file->InitData->Path.c_str());
+    sample->mSample.medium = MediumStrToInt(child->Attribute("Medium"));
+    sample->mSample.unk = child->IntAttribute("bit26");
 
-    sample->mSample.codec = reader->ReadUByte();
-    sample->mSample.medium = reader->ReadUByte();
-    sample->mSample.unk = reader->ReadUByte();
-    sample->mSample.size = reader->ReadUInt32();
-    sample->mSample.tuning = reader->ReadFloat();
-    sample->mSample.loop = LoadChild<AdpcmLoopData*>(reader->ReadUInt64());
-    sample->mSample.book = LoadChild<AdpcmBookData*>(reader->ReadUInt64());
-    sample->mSample.sampleAddr = new uint8_t[sample->mSample.size];
-    reader->Read((char*) sample->mSample.sampleAddr, sample->mSample.size);
-
-    if(sample->mSample.codec == 2){
-        sample->mSample.medium = 2;
-        for(size_t i = 0; i < sample->mSample.size / 2; i++){
-            int16_t* sampleData = (int16_t*) sample->mSample.sampleAddr;
-            sampleData[i] = BSWAP16(sampleData[i]);
+    tinyxml2::XMLElement* loopRoot = child->FirstChildElement("ADPCMLoop");
+    if (loopRoot != nullptr) {
+        size_t i = 0;
+        sample->mSample.loop = new AdpcmLoopData();
+        sample->mSample.loop->start = loopRoot->UnsignedAttribute("Start");
+        sample->mSample.loop->end = loopRoot->UnsignedAttribute("End");
+        sample->mSample.loop->count = loopRoot->UnsignedAttribute("Count");
+        tinyxml2::XMLElement* predictor = loopRoot->FirstChildElement("Predictor");
+        while (predictor != nullptr) {
+            sample->mSample.loop->predictorState[i++] = predictor->IntAttribute("State");
+            predictor = predictor->NextSiblingElement();
         }
-    } else {
-        sample->mSample.medium = 0;
     }
 
-    sample->mSample.isRelocated = 1;
+    tinyxml2::XMLElement* bookRoot = child->FirstChildElement("ADPCMBook");
+    if (bookRoot != nullptr) {
+        size_t i = 0;
+        sample->mSample.book = new AdpcmBookData();
+        sample->mSample.book->numPredictors = bookRoot->IntAttribute("Npredictors");
+        sample->mSample.book->order = bookRoot->IntAttribute("Order");
+        tinyxml2::XMLElement* book = bookRoot->FirstChildElement("Book");
+        size_t numBooks = sample->mSample.book->numPredictors * sample->mSample.book->order * 8;
+        sample->mSample.book->book = new int16_t[numBooks];
+        while (book != nullptr) {
+            sample->mSample.book->book[i++] = book->IntAttribute("Page");
+            book = book->NextSiblingElement();
+        }
+    }
+
+    size_t size = child->Int64Attribute("Size");
+    sample->mSample.size = size;
+
+    const char* path = child->Attribute("Path");
+    initData->Path = path;
+    initData->IsCustom = false;
+    initData->ByteOrder = Ship::Endianness::Native;
+    auto sampleFile = Ship::Context::GetInstance()->GetResourceManager()->GetArchiveManager()->LoadFile(path, initData);
+    if (customFormatStr != nullptr) {
+        // Compressed files can take a really long time to decode (~250ms per).
+        // This worked when we tested it (09/04/2024) (Works on my machine)
+        if (strcmp(customFormatStr, "wav") == 0) {
+            drwav wav;
+            drwav_uint64 numFrames;
+
+            drwav_bool32 ret =
+                drwav_init_memory(&wav, sampleFile->Buffer.get()->data(), sampleFile->Buffer.get()->size(), nullptr);
+
+            drwav_get_length_in_pcm_frames(&wav, &numFrames);
+
+            sample->mSample.tuning = (wav.sampleRate * wav.channels) / 32000.0f;
+            sample->mSample.sampleAddr = new uint8_t[numFrames * wav.channels * 2];
+
+            drwav_read_pcm_frames_s16(&wav, numFrames, (int16_t*)sample->mSample.sampleAddr);
+            return sample;
+        } else if (strcmp(customFormatStr, "ogg") == 0) {
+            std::thread fileDecoderThread = std::thread(OggDecoderWorker, sample, sampleFile);
+            fileDecoderThread.detach();
+            return sample;
+        } else if (strcmp(customFormatStr, "mp3") == 0) {
+            std::thread fileDecoderThread = std::thread(Mp3DecoderWorker, sample, sampleFile);
+            fileDecoderThread.detach();
+            return sample;
+        }
+    }
+    // Not a normal streamed sample. Fallback to the original ADPCM sample to be decoded by the audio engine.
+    sample->mSample.sampleAddr = new uint8_t[size];
+    // Can't use memcpy due to endianness issues.
+    for (uint32_t i = 0; i < size; i++) {
+        sample->mSample.sampleAddr[i] = (*sampleFile->Buffer)[i];
+    }
 
     return sample;
 }
+
+uint8_t ResourceFactoryXMLSampleV0::CodecStrToInt(const char* str, const char* file) {
+    if (strcmp("ADPCM", str) == 0) {
+        return CODEC_ADPCM;
+    } else if (strcmp("S8", str) == 0) {
+        return CODEC_S8;
+    } else if (strcmp("S16MEM", str) == 0) {
+        return CODEC_S16_INMEMORY;
+    } else if (strcmp("ADPCMSMALL", str) == 0) {
+        return CODEC_SMALL_ADPCM;
+    } else if (strcmp("REVERB", str) == 0) {
+        return CODEC_REVERB;
+    } else if (strcmp("S16", str) == 0) {
+        return CODEC_S16;
+    } else {
+        char buff[2048];
+        snprintf(buff, 2048,
+                 "Invalid codec in %s. Got %s, expected ADPCM, S8, S16MEM, ADPCMSMALL, REVERB, S16, UNK6, UNK7.", file,
+                 str);
+        throw std::runtime_error(buff);
+    }
+}
+
+uint32_t ResourceFactoryXMLSampleV0::MediumStrToInt(const char* str) {
+    if (!strcmp("Ram", str)) {
+        return 0;
+    } else if (!strcmp("Unk", str)) {
+        return 1;
+    } else if (!strcmp("Cart", str)) {
+        return 2;
+    } else if (!strcmp("Disk", str)) {
+        return 3;
+        // 4 is skipped
+    } else if (!strcmp("RamUnloaded", str)) {
+        return 5;
+    } else {
+        char buff[2048];
+        snprintf(buff, 2048,
+                 "Bad medium value. Got %s, expected Ram, Unk, Cart, or Disk.", str);
+        throw std::runtime_error(buff);
+    }
+}
 } // namespace LUS
diff --git a/src/port/resource/importers/audio/SampleFactory.h b/src/port/resource/importers/audio/SampleFactory.h
index 4b2017eb..01fbf97e 100644
--- a/src/port/resource/importers/audio/SampleFactory.h
+++ b/src/port/resource/importers/audio/SampleFactory.h
@@ -1,16 +1,26 @@
 #pragma once
 
 #include "Resource.h"
+#include "ResourceFactoryXML.h"
 #include "ResourceFactoryBinary.h"
 
 namespace SF64 {
+struct OggFileData { // NOTE(review): presumably a read cursor over an in-memory Ogg file — confirm against its users
+    void* data;  // start of the buffer; ownership is not expressed by this type
+    size_t pos;  // NOTE(review): presumably the current offset into data — confirm
+    size_t size; // NOTE(review): presumably the total byte length of data — confirm
+};
+
 class ResourceFactoryBinarySampleV1 : public Ship::ResourceFactoryBinary {
   public:
     std::shared_ptr<Ship::IResource> ReadResource(std::shared_ptr<Ship::File> file) override;
 };
 
-class ResourceFactoryBinarySampleV2 : public Ship::ResourceFactoryBinary {
+class ResourceFactoryXMLSampleV0 : public Ship::ResourceFactoryXML {
   public:
     std::shared_ptr<Ship::IResource> ReadResource(std::shared_ptr<Ship::File> file) override;
+  private:
+    static uint8_t CodecStrToInt(const char* str, const char* file);
+    static uint32_t MediumStrToInt(const char* str);
 };
 }; // namespace LUS
diff --git a/tools/Torch b/tools/Torch
index 28dcd128..053d97a4 160000
--- a/tools/Torch
+++ b/tools/Torch
@@ -1 +1 @@
-Subproject commit 28dcd128b0406a43ab7ef9718213f7ab7d3736f8
+Subproject commit 053d97a433f3cfc9607b7cedb512d2e7ee1dc78a

From 93199b4c02eca1364359e88fdd700386d6fc8bf0 Mon Sep 17 00:00:00 2001
From: KiritoDv <kiritodev01@gmail.com>
Date: Mon, 3 Feb 2025 03:12:20 -0600
Subject: [PATCH 2/6] We did partial success while loading this

---
 src/audio/audio_synthesis.c                   |   2 +-
 src/port/Engine.cpp                           |   3 +
 .../importers/audio/SampleFactory.cpp         |  14 +-
 .../importers/audio/SoundFontFactory.cpp      | 212 ++++++++++++++++++
 .../importers/audio/SoundFontFactory.h        |  15 ++
 tools/Torch                                   |   2 +-
 6 files changed, 241 insertions(+), 7 deletions(-)

diff --git a/src/audio/audio_synthesis.c b/src/audio/audio_synthesis.c
index f360229a..a9f16173 100644
--- a/src/audio/audio_synthesis.c
+++ b/src/audio/audio_synthesis.c
@@ -1042,7 +1042,7 @@ Acmd* AudioSynth_ProcessNote(s32 noteIndex, NoteSubEu* noteSub, NoteSynthesisSta
                         numSamplesProcessed += numSamplesToLoadAdj;
                         dmemUncompressedAddrOffset1 = numSamplesToLoadAdj;
 
-                        if (((synthState->samplePosInt * 2) + (numSamplesToLoadAdj)*SAMPLE_SIZE) < bookSample->size) {
+                        if (((synthState->samplePosInt * 2) + (numSamplesToLoadAdj) * SAMPLE_SIZE) < bookSample->size) {
                             bytesToRead = (numSamplesToLoadAdj) * SAMPLE_SIZE;
                         } else {
                             bytesToRead = bookSample->size - (synthState->samplePosInt * 2);
diff --git a/src/port/Engine.cpp b/src/port/Engine.cpp
index aaebe67a..dcedb08d 100644
--- a/src/port/Engine.cpp
+++ b/src/port/Engine.cpp
@@ -244,6 +244,9 @@ GameEngine::GameEngine() {
     loader->RegisterResourceFactory(std::make_shared<SF64::ResourceFactoryBinarySoundFontV0>(), RESOURCE_FORMAT_BINARY,
                                     "SoundFont", static_cast<uint32_t>(SF64::ResourceType::SoundFont), 0);
 
+    loader->RegisterResourceFactory(std::make_shared<SF64::ResourceFactoryXMLSoundFontV0>(), RESOURCE_FORMAT_XML,
+                                    "SoundFont", static_cast<uint32_t>(SF64::ResourceType::SoundFont), 0);
+
     prevAltAssets = CVarGetInteger("gEnhancements.Mods.AlternateAssets", 0);
     gEnableGammaBoost = CVarGetInteger("gGraphics.GammaMode", 0) == 0;
     context->GetResourceManager()->SetAltAssetsEnabled(prevAltAssets);
diff --git a/src/port/resource/importers/audio/SampleFactory.cpp b/src/port/resource/importers/audio/SampleFactory.cpp
index 8289266a..994a9131 100644
--- a/src/port/resource/importers/audio/SampleFactory.cpp
+++ b/src/port/resource/importers/audio/SampleFactory.cpp
@@ -101,12 +101,14 @@ static void Mp3DecoderWorker(std::shared_ptr<Sample> sample, std::shared_ptr<Shi
     drmp3 mp3;
     drwav_uint64 numFrames;
     drmp3_bool32 ret =
-        drmp3_init_memory(&mp3, sampleFile->Buffer.get()->data(), sampleFile->Buffer.get()->size(), nullptr);
+        drmp3_init_memory(&mp3, sampleFile->Buffer->data(), sampleFile->Buffer->size(), nullptr);
     numFrames = drmp3_get_pcm_frame_count(&mp3);
     drwav_uint64 channels = mp3.channels;
     drwav_uint64 sampleRate = mp3.sampleRate;
 
-    sample->mSample.sampleAddr = new uint8_t[numFrames * channels * 2];
+    sample->mSample.tuning = (float)(sampleRate * channels) / 32000.0f;
+    sample->mSample.size = numFrames * channels * 2;
+    sample->mSample.sampleAddr = new uint8_t[sample->mSample.size];
     drmp3_read_pcm_frames_s16(&mp3, numFrames, (int16_t*)sample->mSample.sampleAddr);
 }
 
@@ -131,6 +133,7 @@ static void OggDecoderWorker(std::shared_ptr<Sample> sample, std::shared_ptr<Shi
     int bitStream = 0;
     size_t toRead = numFrames * numChannels * 2;
     sample->mSample.sampleAddr = new uint8_t[toRead];
+    sample->mSample.tuning = (float)(sampleRate * numChannels) / 32000.0f;
     do {
         read = ov_read(&vf, dataBuff, 4096, 0, 2, 1, &bitStream);
         memcpy(sample->mSample.sampleAddr + pos, dataBuff, read);
@@ -199,12 +202,13 @@ std::shared_ptr<Ship::IResource> ResourceFactoryXMLSampleV0::ReadResource(std::s
             drwav_uint64 numFrames;
 
             drwav_bool32 ret =
-                drwav_init_memory(&wav, sampleFile->Buffer.get()->data(), sampleFile->Buffer.get()->size(), nullptr);
+                drwav_init_memory(&wav, sampleFile->Buffer->data(), sampleFile->Buffer->size(), nullptr);
 
             drwav_get_length_in_pcm_frames(&wav, &numFrames);
 
-            sample->mSample.tuning = (wav.sampleRate * wav.channels) / 32000.0f;
-            sample->mSample.sampleAddr = new uint8_t[numFrames * wav.channels * 2];
+            sample->mSample.tuning = (float)(wav.sampleRate * wav.channels) / 32000.0f;
+            sample->mSample.size = numFrames * wav.channels * 2;
+            sample->mSample.sampleAddr = new uint8_t[sample->mSample.size];
 
             drwav_read_pcm_frames_s16(&wav, numFrames, (int16_t*)sample->mSample.sampleAddr);
             return sample;
diff --git a/src/port/resource/importers/audio/SoundFontFactory.cpp b/src/port/resource/importers/audio/SoundFontFactory.cpp
index 431f9410..1fda2854 100644
--- a/src/port/resource/importers/audio/SoundFontFactory.cpp
+++ b/src/port/resource/importers/audio/SoundFontFactory.cpp
@@ -1,5 +1,7 @@
 #include "SoundFontFactory.h"
 #include "../ResourceUtil.h"
+#include "utils/StringHelper.h"
+#include <sf64audio_provisional.h>
 #include "port/resource/type/audio/SoundFont.h"
 
 namespace SF64 {
@@ -29,4 +31,214 @@ std::shared_ptr<Ship::IResource> ResourceFactoryBinarySoundFontV0::ReadResource(
 
     return font;
 }
+
+int8_t ResourceFactoryXMLSoundFontV0::MediumStrToInt(const char* str) { // medium name -> MEDIUM_*; 4 is skipped
+    if (strcmp(str, "Ram") == 0) {
+        return MEDIUM_RAM;
+    }
+    if (strcmp(str, "Unk") == 0) {
+        return MEDIUM_UNK;
+    }
+    if (strcmp(str, "Cart") == 0) {
+        return MEDIUM_CART;
+    }
+    if (strcmp(str, "Disk") == 0) {
+        return MEDIUM_DISK_DRIVE;
+    }
+    throw std::runtime_error(StringHelper::Sprintf("Bad medium value. Got %s, expected Ram, Unk, Cart, or Disk.", str));
+}
+
+int8_t ResourceFactoryXMLSoundFontV0::CachePolicyToInt(const char* str) {
+    // Table-driven lookup from the XML cache policy name to its CACHE_* value; unknown names throw.
+    static const struct { const char* name; int policy; } kPolicies[] = {
+        { "Temporary", CACHE_TEMPORARY },
+        { "Persistent", CACHE_PERSISTENT },
+        { "Either", CACHE_EITHER },
+        { "Permanent", CACHE_PERMANENT },
+    };
+    for (const auto& entry : kPolicies) {
+        if (strcmp(entry.name, str) == 0) return static_cast<int8_t>(entry.policy);
+    }
+    throw std::runtime_error(StringHelper::Sprintf(
+        "Bad cache policy value. Got %s, expected Temporary, Persistent, Either, or Permanent.", str));
+}
+
+// Parses each child of the <Drums> element into a DrumData entry and publishes the list via soundFont->mFont.
+// A drum whose SampleRef is missing or fails to load is stored as nullptr so the following drums keep their index.
+void ResourceFactoryXMLSoundFontV0::ParseDrums(SoundFont* soundFont, tinyxml2::XMLElement* element) {
+    tinyxml2::XMLElement* drumElement = element->FirstChildElement();
+    // No drums
+    if (drumElement == nullptr) {
+        soundFont->mFont.drums = nullptr;
+        soundFont->mFont.numDrums = 0;
+        return;
+    }
+
+    // Child lookups use their own pointer so this cursor never leaves the list of drum elements.
+    for (; drumElement != nullptr; drumElement = drumElement->NextSiblingElement()) {
+        auto drum = new DrumData;
+        drum->adsrDecayIndex = drumElement->IntAttribute("ReleaseRate");
+        drum->pan = drumElement->IntAttribute("Pan");
+        drum->isRelocated = drumElement->IntAttribute("Loaded");
+        drum->tunedSample.tuning = drumElement->FloatAttribute("Tuning");
+        drum->tunedSample.sample = nullptr;
+        drum->envelope = nullptr;
+
+        const char* sampleStr = drumElement->Attribute("SampleRef");
+        if (sampleStr != nullptr && sampleStr[0] != 0) {
+            auto res = Ship::Context::GetInstance()->GetResourceManager()->LoadResourceProcess(sampleStr);
+            drum->tunedSample.sample = static_cast<SampleData*>(res ? res->GetRawPointer() : nullptr);
+        }
+
+        // Only a first child named <Envelopes> is read; a drum without children simply has no envelope.
+        tinyxml2::XMLElement* envElement = drumElement->FirstChildElement();
+        if (envElement != nullptr && !strcmp(envElement->Name(), "Envelopes")) {
+            unsigned int envCount = 0;
+            std::vector<EnvelopePointData> envelopes = ParseEnvelopes(soundFont, envElement, &envCount);
+            drum->envelope = new EnvelopePointData[envelopes.size()];
+            memcpy(drum->envelope, envelopes.data(), envelopes.size() * sizeof(EnvelopePointData));
+        }
+
+        if (drum->tunedSample.sample == nullptr) {
+            // Nothing else references this drum yet, so release it (and its envelope) instead of leaking it.
+            delete[] drum->envelope;
+            delete drum;
+            drum = nullptr;
+        }
+
+        soundFont->mDrums.push_back(drum);
+    }
+
+    soundFont->mFont.numDrums = soundFont->mDrums.size();
+    soundFont->mFont.drums = soundFont->mDrums.data();
+}
+
+// Parses each child of the <Instruments> element into an InstrumentData entry and publishes the list via
+// soundFont->mFont. The children of an instrument are read positionally and are all optional: <Envelopes>,
+// <LowNotesSound>, <NormalNotesSound>, <HighNotesSound>.
+void ResourceFactoryXMLSoundFontV0::ParseInstruments(SoundFont* soundFont, tinyxml2::XMLElement* element) {
+    // The cursor stays on the list of instrument elements; an empty list or a childless instrument is valid.
+    for (tinyxml2::XMLElement* instrumentNode = element->FirstChildElement(); instrumentNode != nullptr;
+         instrumentNode = instrumentNode->NextSiblingElement()) {
+        // Value-initialize so the envelope and sample pointers are null when their XML elements are absent.
+        auto instrument = new InstrumentData();
+        unsigned int envCount = 0;
+
+        // NOTE(review): the "IsValid" attribute is not consulted here; confirm whether it should be.
+        instrument->isRelocated = instrumentNode->IntAttribute("Loaded");
+        instrument->normalRangeLo = instrumentNode->IntAttribute("NormalRangeLo");
+        instrument->normalRangeHi = instrumentNode->IntAttribute("NormalRangeHi");
+        instrument->adsrDecayIndex = instrumentNode->IntAttribute("ReleaseRate");
+        tinyxml2::XMLElement* instrumentElement = instrumentNode->FirstChildElement();
+
+        if (instrumentElement != nullptr && !strcmp(instrumentElement->Name(), "Envelopes")) {
+            std::vector<EnvelopePointData> envelopes = ParseEnvelopes(soundFont, instrumentElement, &envCount);
+            instrument->envelope = new EnvelopePointData[envelopes.size()];
+            memcpy(instrument->envelope, envelopes.data(), envelopes.size() * sizeof(EnvelopePointData));
+            instrumentElement = instrumentElement->NextSiblingElement();
+        }
+
+        // Each *NotesSound takes its tuning from the XML unless the loaded sample has a non-zero tuning of its own.
+        if (instrumentElement != nullptr && !strcmp("LowNotesSound", instrumentElement->Name())) {
+            instrument->lowPitchTunedSample.tuning = instrumentElement->FloatAttribute("Tuning");
+            const char* sampleStr = instrumentElement->Attribute("SampleRef");
+            if (sampleStr != nullptr && sampleStr[0] != 0) {
+                auto res = static_pointer_cast<Sample>(
+                    Ship::Context::GetInstance()->GetResourceManager()->LoadResourceProcess(sampleStr, true));
+                auto sample = static_cast<SampleData*>(res ? res->GetRawPointer() : nullptr);
+                instrument->lowPitchTunedSample.sample = sample;
+                if (sample != nullptr && sample->tuning != 0.0f) {
+                    instrument->lowPitchTunedSample.tuning = sample->tuning;
+                }
+            }
+            instrumentElement = instrumentElement->NextSiblingElement();
+        }
+
+        if (instrumentElement != nullptr && !strcmp("NormalNotesSound", instrumentElement->Name())) {
+            instrument->normalPitchTunedSample.tuning = instrumentElement->FloatAttribute("Tuning");
+            const char* sampleStr = instrumentElement->Attribute("SampleRef");
+            if (sampleStr != nullptr && sampleStr[0] != 0) {
+                auto res = static_pointer_cast<Sample>(
+                    Ship::Context::GetInstance()->GetResourceManager()->LoadResourceProcess(sampleStr, true));
+                auto sample = static_cast<SampleData*>(res ? res->GetRawPointer() : nullptr);
+                instrument->normalPitchTunedSample.sample = sample;
+                if (sample != nullptr && sample->tuning != 0.0f) {
+                    instrument->normalPitchTunedSample.tuning = sample->tuning;
+                }
+            }
+            instrumentElement = instrumentElement->NextSiblingElement();
+        }
+
+        if (instrumentElement != nullptr && !strcmp("HighNotesSound", instrumentElement->Name())) {
+            instrument->highPitchTunedSample.tuning = instrumentElement->FloatAttribute("Tuning");
+            const char* sampleStr = instrumentElement->Attribute("SampleRef");
+            if (sampleStr != nullptr && sampleStr[0] != 0) {
+                auto res = static_pointer_cast<Sample>(
+                    Ship::Context::GetInstance()->GetResourceManager()->LoadResourceProcess(sampleStr, true));
+                auto sample = static_cast<SampleData*>(res ? res->GetRawPointer() : nullptr);
+                instrument->highPitchTunedSample.sample = sample;
+                if (sample != nullptr && sample->tuning != 0.0f) {
+                    instrument->highPitchTunedSample.tuning = sample->tuning;
+                }
+            }
+            instrumentElement = instrumentElement->NextSiblingElement();
+        }
+
+        soundFont->mInstruments.push_back(instrument);
+    }
+
+    soundFont->mFont.instruments = soundFont->mInstruments.data();
+    soundFont->mFont.numInstruments = soundFont->mInstruments.size();
+}
+
+std::vector<EnvelopePointData> ResourceFactoryXMLSoundFontV0::ParseEnvelopes(SoundFont* soundFont,
+                                                                             tinyxml2::XMLElement* element,
+                                                                             unsigned int* count) {
+    // Collects the Delay/Arg pair of every <Envelope> child, in document order, and reports how many were
+    // read through *count. soundFont is not used here.
+    std::vector<EnvelopePointData> points;
+    for (tinyxml2::XMLElement* node = element->FirstChildElement("Envelope"); node != nullptr;
+         node = node->NextSiblingElement("Envelope")) {
+        EnvelopePointData point = {
+            .delay = (s16)node->IntAttribute("Delay"),
+            .arg = (s16)node->IntAttribute("Arg"),
+        };
+        points.push_back(point);
+    }
+
+    *count = static_cast<unsigned int>(points.size());
+    return points;
+}
+
+// Builds a SoundFont from its XML description: header bytes come from the root element's Data1/Data2 attributes,
+// then every <Drums> / <Instruments> child is parsed. Returns nullptr for an invalid file or a missing root element.
+std::shared_ptr<Ship::IResource> ResourceFactoryXMLSoundFontV0::ReadResource(std::shared_ptr<Ship::File> file) {
+    if (!FileHasValidFormatAndReader(file)) {
+        return nullptr;
+    }
+    auto root = std::get<std::shared_ptr<tinyxml2::XMLDocument>>(file->Reader)->FirstChildElement();
+    if (root == nullptr) {
+        return nullptr;
+    }
+    auto audioSoundFont = std::make_shared<SoundFont>(file->InitData);
+    // Header data
+    memset(&audioSoundFont->mFont, 0, sizeof(audioSoundFont->mFont));
+    auto shortData1 = root->IntAttribute("Data1");
+    auto shortData2 = root->IntAttribute("Data2");
+    // Each attribute packs two bytes; the counts are replaced by the parsed totals when the lists are present.
+    audioSoundFont->mFont.numInstruments = (shortData2 >> 8) & 0xFFu;
+    audioSoundFont->mFont.numDrums = shortData2 & 0xFFu;
+    audioSoundFont->mFont.sampleBankId1 = (shortData1 >> 8) & 0xFFu;
+    audioSoundFont->mFont.sampleBankId2 = shortData1 & 0xFFu;
+
+    for (auto section = root->FirstChildElement(); section != nullptr; section = section->NextSiblingElement()) {
+        const char* name = section->Name();
+        if (!strcmp(name, "Drums")) {
+            ParseDrums(audioSoundFont.get(), section);
+        } else if (!strcmp(name, "Instruments")) {
+            ParseInstruments(audioSoundFont.get(), section);
+        }
+    }
+    return audioSoundFont;
+}
 } // namespace LUS
diff --git a/src/port/resource/importers/audio/SoundFontFactory.h b/src/port/resource/importers/audio/SoundFontFactory.h
index 04823a55..b5afda9a 100644
--- a/src/port/resource/importers/audio/SoundFontFactory.h
+++ b/src/port/resource/importers/audio/SoundFontFactory.h
@@ -1,11 +1,26 @@
 #pragma once
 
 #include "Resource.h"
+#include "ResourceFactoryXML.h"
 #include "ResourceFactoryBinary.h"
+#include "port/resource/type/audio/SoundFont.h"
 
 namespace SF64 {
 class ResourceFactoryBinarySoundFontV0 : public Ship::ResourceFactoryBinary {
   public:
     std::shared_ptr<Ship::IResource> ReadResource(std::shared_ptr<Ship::File> file) override;
 };
+
+class ResourceFactoryXMLSoundFontV0 : public Ship::ResourceFactoryXML { // XML importer for SoundFont resources
+  public:
+    std::shared_ptr<Ship::IResource> ReadResource(std::shared_ptr<Ship::File> file) override;
+    static int8_t MediumStrToInt(const char* str);   // "Ram"/"Unk"/"Cart"/"Disk" -> MEDIUM_*; throws otherwise
+    static int8_t CachePolicyToInt(const char* str); // "Temporary"/"Persistent"/"Either"/"Permanent" -> CACHE_*
+
+  private:
+    void ParseDrums(SoundFont* soundFont, tinyxml2::XMLElement* element);       // reads the <Drums> element
+    void ParseInstruments(SoundFont* soundFont, tinyxml2::XMLElement* element); // reads the <Instruments> element
+    std::vector<EnvelopePointData> ParseEnvelopes(SoundFont* soundFont, tinyxml2::XMLElement* element,
+                                              unsigned int* count); // *count receives the number of points read
+};
 }; // namespace LUS
diff --git a/tools/Torch b/tools/Torch
index 053d97a4..27af7233 160000
--- a/tools/Torch
+++ b/tools/Torch
@@ -1 +1 @@
-Subproject commit 053d97a433f3cfc9607b7cedb512d2e7ee1dc78a
+Subproject commit 27af72331ceba7703f0382bb0316320baed377a3

From 3020f3bd95676d2830a8310b92017880404958ad Mon Sep 17 00:00:00 2001
From: KiritoDv <kiritodev01@gmail.com>
Date: Mon, 3 Feb 2025 11:48:30 -0600
Subject: [PATCH 3/6] Fixed implementation issues with CODEC_S16

---
 src/audio/audio_synthesis.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/audio/audio_synthesis.c b/src/audio/audio_synthesis.c
index a9f16173..93cb7b1b 100644
--- a/src/audio/audio_synthesis.c
+++ b/src/audio/audio_synthesis.c
@@ -1036,21 +1036,23 @@ Acmd* AudioSynth_ProcessNote(s32 noteIndex, NoteSubEu* noteSub, NoteSynthesisSta
                         goto skip;
 
                     case CODEC_S16:
+                        aClearBuffer(cmd, DMEM_UNCOMPRESSED_NOTE, (numSamplesToLoadAdj + 16) * 2);
                         flags = A_CONTINUE;
                         skipBytes = 0;
-                        size_t bytesToRead;
                         numSamplesProcessed += numSamplesToLoadAdj;
-                        dmemUncompressedAddrOffset1 = numSamplesToLoadAdj;
+                        aligned = numSamplesToLoadAdj;
+                        size_t bytesToRead;
 
-                        if (((synthState->samplePosInt * 2) + (numSamplesToLoadAdj) * SAMPLE_SIZE) < bookSample->size) {
-                            bytesToRead = (numSamplesToLoadAdj) * SAMPLE_SIZE;
+                        if (((synthState->samplePosInt * 2) + (numSamplesToLoadAdj + 16) * 2) <
+                            bookSample->size) {
+                            bytesToRead = (numSamplesToLoadAdj + 16) * 2;
                         } else {
                             bytesToRead = bookSample->size - (synthState->samplePosInt * 2);
                         }
                         // @port [Custom audio]
                         // TLDR samples are loaded async and might be null the first time they are played.
                         // See note in AudioSampleFactory.cpp
-                        if (sampleAddr != NULL) {
+                        if ((void*) sampleAddr != NULL) {
                             aLoadBuffer(cmd++, sampleAddr + (synthState->samplePosInt * 2), DMEM_UNCOMPRESSED_NOTE,
                                         bytesToRead);
                         }

From 717414a5c9dcec23829f56f0e0f997ca5c70f09b Mon Sep 17 00:00:00 2001
From: KiritoDv <kiritodev01@gmail.com>
Date: Mon, 3 Feb 2025 14:57:28 -0600
Subject: [PATCH 4/6] Fixed isRelocated

---
 src/port/resource/importers/audio/SampleFactory.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/port/resource/importers/audio/SampleFactory.cpp b/src/port/resource/importers/audio/SampleFactory.cpp
index 994a9131..890641a3 100644
--- a/src/port/resource/importers/audio/SampleFactory.cpp
+++ b/src/port/resource/importers/audio/SampleFactory.cpp
@@ -229,6 +229,8 @@ std::shared_ptr<Ship::IResource> ResourceFactoryXMLSampleV0::ReadResource(std::s
         sample->mSample.sampleAddr[i] = (*sampleFile->Buffer)[i];
     }
 
+    sample->mSample.isRelocated = 1;
+
     return sample;
 }
 

From 4734a5ea068c6ed23f106324364da0332f65f0f8 Mon Sep 17 00:00:00 2001
From: KiritoDv <kiritodev01@gmail.com>
Date: Mon, 3 Feb 2025 17:18:21 -0600
Subject: [PATCH 5/6] Rewritten this thing

---
 src/audio/audio_synthesis.c | 22 ++++------------------
 1 file changed, 4 insertions(+), 18 deletions(-)

diff --git a/src/audio/audio_synthesis.c b/src/audio/audio_synthesis.c
index 93cb7b1b..af3348c0 100644
--- a/src/audio/audio_synthesis.c
+++ b/src/audio/audio_synthesis.c
@@ -1036,26 +1036,12 @@ Acmd* AudioSynth_ProcessNote(s32 noteIndex, NoteSubEu* noteSub, NoteSynthesisSta
                         goto skip;
 
                     case CODEC_S16:
-                        aClearBuffer(cmd, DMEM_UNCOMPRESSED_NOTE, (numSamplesToLoadAdj + 16) * 2);
+                        aLoadBuffer(aList++, OS_K0_TO_PHYSICAL(bookSample->sampleAddr + synthState->samplePosInt * 2), DMEM_UNCOMPRESSED_NOTE,
+                                    (numSamplesToLoadAdj + SAMPLES_PER_FRAME) * 2);
                         flags = A_CONTINUE;
                         skipBytes = 0;
-                        numSamplesProcessed += numSamplesToLoadAdj;
-                        aligned = numSamplesToLoadAdj;
-                        size_t bytesToRead;
-
-                        if (((synthState->samplePosInt * 2) + (numSamplesToLoadAdj + 16) * 2) <
-                            bookSample->size) {
-                            bytesToRead = (numSamplesToLoadAdj + 16) * 2;
-                        } else {
-                            bytesToRead = bookSample->size - (synthState->samplePosInt * 2);
-                        }
-                        // @port [Custom audio]
-                        // TLDR samples are loaded async and might be null the first time they are played.
-                        // See note in AudioSampleFactory.cpp
-                        if ((void*) sampleAddr != NULL) {
-                            aLoadBuffer(cmd++, sampleAddr + (synthState->samplePosInt * 2), DMEM_UNCOMPRESSED_NOTE,
-                                        bytesToRead);
-                        }
+                        numSamplesProcessed = numSamplesToLoadAdj;
+                        dmemUncompressedAddrOffset1 = numSamplesToLoadAdj;
 
                         goto skip;
                 }

From ddf9db7bb78fd276902f52487e0c04ce9c89772f Mon Sep 17 00:00:00 2001
From: coco875 <59367621+coco875@users.noreply.github.com>
Date: Tue, 4 Feb 2025 23:15:21 +0100
Subject: [PATCH 6/6] Sound optimisation (#150)

* add sse2neon and optimise aEnvMixerImpl

* optimise aResampleImpl

* optimise aMixImpl

* optimise aADPCMdecImpl
---
 CMakeLists.txt    |   6 +
 src/audio/mixer.c | 475 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 481 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ffe5a58f..525123b7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -189,6 +189,12 @@ if (MSVC)
   endif()
 endif()
 
+#=================== SSE2NEON ===================
+# Downloads sse2neon.h at configure time. NOTE(review): the URL tracks master with no EXPECTED_HASH or STATUS check.
+set(SSE2NEON_DIR ${CMAKE_BINARY_DIR}/_deps/sse2neon)
+file(DOWNLOAD "https://raw.githubusercontent.com/DLTcollab/sse2neon/refs/heads/master/sse2neon.h" "${SSE2NEON_DIR}/sse2neon.h")
+include_directories(${SSE2NEON_DIR})
+
 FetchContent_Declare(
     dr_libs
     GIT_REPOSITORY https://github.com/mackron/dr_libs.git
diff --git a/src/audio/mixer.c b/src/audio/mixer.c
index d9da7769..73254973 100644
--- a/src/audio/mixer.c
+++ b/src/audio/mixer.c
@@ -3,12 +3,68 @@
 #include <string.h>
 #include <stdio.h>
 
+#include <macros.h>
+
 #include "mixer.h"
 
 #ifndef __clang__
 #pragma GCC optimize ("unroll-loops")
 #endif
 
+#if defined(__SSE2__) || defined(__aarch64__)
+#define SSE2_AVAILABLE
+#else
+#pragma message("Warning: SSE2 support is not available. Code will not compile")
+#endif
+
+#if defined(__SSE2__)
+#include <emmintrin.h>
+#elif defined(__aarch64__)
+#include "sse2neon.h"
+#endif
+
+#ifdef SSE2_AVAILABLE
+typedef struct { // eight 32-bit lanes carried in two SSE registers
+    __m128i lo, hi; // lo: lanes 0-3, hi: lanes 4-7 (the order m256i_mul_epi16 produces)
+} m256i;
+
+// Multiplies the eight signed 16-bit lanes of a and b and returns the full 32-bit products
+// (products of lanes 0-3 in .lo, of lanes 4-7 in .hi).
+static m256i m256i_mul_epi16(__m128i a, __m128i b) {
+    // SSE2 yields the low and the high 16 bits of each product separately ...
+    const __m128i low_halves = _mm_mullo_epi16(a, b);
+    const __m128i high_halves = _mm_mulhi_epi16(a, b);
+    // ... and interleaving the two halves lane by lane rebuilds the 32-bit values.
+    m256i product = { _mm_unpacklo_epi16(low_halves, high_halves), _mm_unpackhi_epi16(low_halves, high_halves) };
+    return product;
+}
+
+// Lane-wise 32-bit addition of two m256i values, one 128-bit half at a time.
+static m256i m256i_add_m256i_epi32(m256i a, m256i b) {
+    a.lo = _mm_add_epi32(a.lo, b.lo); // a is a by-value copy, so it doubles as the result
+    a.hi = _mm_add_epi32(a.hi, b.hi);
+    return a;
+}
+
+// Adds the four 32-bit lanes of b to both halves of a.
+static m256i m256i_add_m128i_epi32(m256i a, __m128i b) {
+    a.lo = _mm_add_epi32(a.lo, b); // a is a by-value copy, so it doubles as the result
+    a.hi = _mm_add_epi32(a.hi, b);
+    return a;
+}
+
+// Arithmetic right shift by b bits of every 32-bit lane in both halves of a.
+static m256i m256i_srai(m256i a, int b) {
+    a.lo = _mm_srai_epi32(a.lo, b); // a is a by-value copy, so it doubles as the result
+    a.hi = _mm_srai_epi32(a.hi, b);
+    return a;
+}
+
+static __m128i m256i_clamp_to_m128i(m256i a) { // narrows eight 32-bit lanes to eight 16-bit lanes
+    return _mm_packs_epi32(a.lo, a.hi); // signed saturation: out-of-range values clamp to the int16 limits
+}
+#endif
+
 #define ROUND_UP_64(v) (((v) + 63) & ~63)
 #define ROUND_UP_32(v) (((v) + 31) & ~31)
 #define ROUND_UP_16(v) (((v) + 15) & ~15)
@@ -218,6 +274,8 @@ void aSetLoopImpl(ADPCM_STATE *adpcm_loop_state) {
     rspa.adpcm_loop_state = adpcm_loop_state;
 }
 
+#ifndef SSE2_AVAILABLE
+
 void aADPCMdecImpl(uint8_t flags, ADPCM_STATE state) {
     uint8_t *in = BUF_U8(rspa.in);
     int16_t *out = BUF_S16(rspa.out);
@@ -269,6 +327,133 @@ void aADPCMdecImpl(uint8_t flags, ADPCM_STATE state) {
     memcpy(state, out - 16, 16 * sizeof(int16_t));
 }
 
+#else
+
+static uint16_t lower_4bit[] = { // low-nibble mask for four 16-bit lanes; read with one 64-bit load in aADPCMdecImpl
+    0xf,
+    0xf,
+    0xf,
+    0xf,
+};
+
+static uint16_t lower_2bit[] = { // 2-bit field mask; four lanes so the 64-bit _mm_loadl_epi64 read stays in bounds
+    0x3, 0x3,
+    0x3, 0x3,
+};
+
+// SSE2 ADPCM decoder: expands frames from rspa.in into 16-bit samples at rspa.out, 16 samples per frame, until
+// ROUND_UP_32(rspa.nbytes) output bytes are produced. The predictor is seeded per flags; `state` gets the last 16.
+void aADPCMdecImpl(uint8_t flags, ADPCM_STATE state) {
+    uint8_t* in = BUF_U8(rspa.in);
+    int16_t* out = BUF_S16(rspa.out);
+    int nbytes = ROUND_UP_32(rspa.nbytes);
+    if (flags & A_INIT) {
+        memset(out, 0, 16 * sizeof(int16_t));
+    } else if (flags & A_LOOP) {
+        memcpy(out, rspa.adpcm_loop_state, 16 * sizeof(int16_t));
+    } else {
+        memcpy(out, state, 16 * sizeof(int16_t));
+    }
+    out += 16;
+    __m128i mask_4bit = _mm_loadl_epi64((__m128i*) lower_4bit);
+    __m128i mask_2bit = _mm_loadl_epi64((__m128i*) lower_2bit);
+
+    while (nbytes > 0) {
+        int shift = *in >> 4; // should be in 0..12 or 0..14
+        int table_index = *in++ & 0xf; // should be in 0..7
+        int16_t(*tbl)[8] = rspa.adpcm_table[table_index];
+
+        for (int i = 0; i < 2; i++) {
+            int16_t ins[8];
+            int16_t prev1 = out[-1];
+            int16_t prev2 = out[-2];
+            __m128i prev1_vec = _mm_set1_epi16(prev1);
+            __m128i prev2_vec = _mm_set1_epi16(prev2);
+
+            __m128i ins_vec;
+            if (flags & 4) {
+                ins_vec = _mm_loadu_si16((__m128i*) in);
+                ins_vec = _mm_unpacklo_epi8(ins_vec, _mm_setzero_si128());
+                __m128i in_vec_up2bit = _mm_srli_epi16(ins_vec, 6);
+                __m128i in_vec_uplower2bit = _mm_and_si128(_mm_srli_epi16(ins_vec, 4), mask_2bit);
+                __m128i in_vec_lowerup2bit = _mm_and_si128(_mm_srli_epi16(ins_vec, 2), mask_2bit);
+                __m128i in_vec_lower2bit = _mm_and_si128(ins_vec, mask_2bit);
+                // Pair the fields of each byte in bit order: (7:6, 5:4) and (3:2, 1:0).
+                __m128i in_vec_up = _mm_unpacklo_epi16(in_vec_up2bit, in_vec_uplower2bit);
+                __m128i in_vec_low = _mm_unpacklo_epi16(in_vec_lowerup2bit, in_vec_lower2bit);
+                // Interleave the 32-bit pairs so byte 0 fills lanes 0..3 and byte 1 fills lanes 4..7.
+                ins_vec = _mm_unpacklo_epi32(in_vec_up, in_vec_low);
+                ins_vec = _mm_slli_epi16(ins_vec, 14);
+                ins_vec = _mm_srai_epi16(ins_vec, 14);
+                ins_vec = _mm_slli_epi16(ins_vec, shift);
+
+                in += 2;
+            } else {
+                ins_vec = _mm_loadu_si32((__m128i*) in);
+                ins_vec = _mm_unpacklo_epi8(ins_vec, _mm_setzero_si128());
+                __m128i in_vec_up4bit = _mm_srli_epi16(ins_vec, 4);
+                __m128i in_vec_lower4bit = _mm_and_si128(ins_vec, mask_4bit);
+                ins_vec = _mm_unpacklo_epi16(in_vec_up4bit, in_vec_lower4bit);
+                ins_vec = _mm_slli_epi16(ins_vec, 12);
+                ins_vec = _mm_srai_epi16(ins_vec, 12);
+                ins_vec = _mm_slli_epi16(ins_vec, shift);
+
+                in += 4;
+            }
+            _mm_storeu_si128((__m128i*) ins, ins_vec);
+            // Produce the 8 outputs of this group four at a time (j selects samples 0-3 or 4-7).
+            for (int j = 0; j < 2; j++) {
+                __m128i tbl0_vec = _mm_loadu_si64((__m128i*) (tbl[0] + (j * 4)));
+                __m128i tbl1_vec = _mm_loadu_si64((__m128i*) (tbl[1] + (j * 4)));
+
+                m256i res;
+                res.lo = _mm_mullo_epi16(tbl0_vec, prev2_vec);
+                res.hi = _mm_mulhi_epi16(tbl0_vec, prev2_vec);
+
+                tbl0_vec = _mm_unpacklo_epi16(res.lo, res.hi);
+
+                res.lo = _mm_mullo_epi16(tbl1_vec, prev1_vec);
+                res.hi = _mm_mulhi_epi16(tbl1_vec, prev1_vec);
+
+                tbl1_vec = _mm_unpacklo_epi16(res.lo, res.hi);
+                __m128i acc_vec = _mm_add_epi32(tbl0_vec, tbl1_vec);
+                // Placing ins in the upper 16 bits and shifting right by 5 yields ins << 11 with the sign kept.
+                __m128i shift_ins = _mm_srai_epi32(j ? _mm_unpackhi_epi16(_mm_setzero_si128(), ins_vec)
+                                                     : _mm_unpacklo_epi16(_mm_setzero_si128(), ins_vec),
+                                                   5);
+                acc_vec = _mm_add_epi32(acc_vec, shift_ins);
+                // Shift tbl[1] so that, for input k, the upper four lanes hold tbl[1][n - k - 1] (zero where n <= k).
+                tbl1_vec = _mm_loadu_si128((__m128i*) tbl[1]);
+                if (j == 0) {
+                    tbl1_vec = _mm_slli_si128(tbl1_vec, (1 - 0) * 8 + 2);
+                } else {
+                    tbl1_vec = _mm_slli_si128(tbl1_vec, (1 - 1) * 8 + 2);
+                }
+                for (int k = 0; k < ((j + 1) * 4); k++) {
+                    __m128i ins_vec2 = _mm_set1_epi16(ins[k]);
+                    res.lo = _mm_mullo_epi16(tbl1_vec, ins_vec2);
+                    res.hi = _mm_mulhi_epi16(tbl1_vec, ins_vec2);
+
+                    __m128i mult = _mm_unpackhi_epi16(res.lo, res.hi);
+                    acc_vec = _mm_add_epi32(acc_vec, mult);
+                    tbl1_vec = _mm_slli_si128(tbl1_vec, 2);
+                }
+
+                acc_vec = _mm_srai_epi32(acc_vec, 11);
+                acc_vec = _mm_packs_epi32(acc_vec, _mm_setzero_si128());
+                _mm_storeu_si64((__m128*) out, acc_vec);
+                out += 4;
+            }
+        }
+        nbytes -= 16 * sizeof(int16_t);
+    }
+    memcpy(state, out - 16, 16 * sizeof(int16_t));
+}
+
+#endif
+
+#ifndef SSE2_AVAILABLE
+
 void aResampleImpl(uint8_t flags, uint16_t pitch, RESAMPLE_STATE state) {
     int16_t tmp[16];
     int16_t *in_initial = BUF_S16(rspa.in);
@@ -320,6 +505,171 @@ void aResampleImpl(uint8_t flags, uint16_t pitch, RESAMPLE_STATE state) {
     memcpy(state + 8, in, 8 * sizeof(int16_t));
 }
 
+#else
+
+// Four 32-bit lanes of 0x4000: the rounding bias the SSE2 paths add before their
+// arithmetic right shift by 15. Declared with ALIGN_ASSET(16) and read with the
+// aligned load _mm_load_si128.
+static const ALIGN_ASSET(16) int32_t x4000[4] = {
+    0x4000, 0x4000, 0x4000, 0x4000
+};
+
+// Transposes, in place, the 4x4 matrix of 32-bit lanes whose rows are *r0..*r3.
+// On return *rK holds lane K of each original row, ordered row 0 to row 3.
+// Implemented with SSE2 integer unpacks only; lanes are moved, never
+// interpreted, so every bit pattern is preserved.
+static void mm128_transpose(__m128i* r0, __m128i* r1, __m128i* r2, __m128i* r3) {
+    // Read all four rows before any store, since the outputs reuse the inputs.
+    const __m128i a = *r0;
+    const __m128i b = *r1;
+    const __m128i c = *r2;
+    const __m128i d = *r3;
+
+    // Interleave the 32-bit lanes of neighbouring rows.
+    const __m128i ab_lo = _mm_unpacklo_epi32(a, b); // a0 b0 a1 b1
+    const __m128i ab_hi = _mm_unpackhi_epi32(a, b); // a2 b2 a3 b3
+    const __m128i cd_lo = _mm_unpacklo_epi32(c, d); // c0 d0 c1 d1
+    const __m128i cd_hi = _mm_unpackhi_epi32(c, d); // c2 d2 c3 d3
+
+    // Join the matching 64-bit halves: each result is one column of the
+    // original matrix.
+    *r0 = _mm_unpacklo_epi64(ab_lo, cd_lo); // a0 b0 c0 d0
+    *r1 = _mm_unpackhi_epi64(ab_lo, cd_lo); // a1 b1 c1 d1
+    *r2 = _mm_unpacklo_epi64(ab_hi, cd_hi); // a2 b2 c2 d2
+    *r3 = _mm_unpackhi_epi64(ab_hi, cd_hi); // a3 b3 c3 d3
+}
+
+// Loads four int16_t from b into the low 64 bits and four from a into the high 64 bits (pure SSE2, no MMX).
+static __m128i move_two_4x16(int16_t* a, int16_t* b) {
+    return _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*) b), _mm_loadl_epi64((__m128i*) a));
+}
+// SSE2 resampler: filters samples from rspa.in into rspa.out with 4-tap rows of resample_table, 8 outputs per block.
+void aResampleImpl(uint8_t flags, uint16_t pitch, RESAMPLE_STATE state) {
+    int16_t tmp[32];
+    int16_t* in_initial = BUF_S16(rspa.in);
+    int16_t* in = in_initial;
+    int16_t* out = BUF_S16(rspa.out);
+    int nbytes = ROUND_UP_16(rspa.nbytes);
+    uint32_t pitch_accumulator;
+    int i;
+    // tmp mirrors state: [0..3] carried input samples, [4] pitch fraction, [5] rewind offset, [8..15] saved samples.
+    if (flags & A_INIT) {
+        memset(tmp, 0, 5 * sizeof(int16_t));
+    } else {
+        memcpy(tmp, state, 16 * sizeof(int16_t));
+    }
+    if (flags & 2) {
+        memcpy(in - 8, tmp + 8, 8 * sizeof(int16_t));
+        in -= tmp[5] / sizeof(int16_t); // NOTE(review): the division is evaluated in size_t; confirm for negative tmp[5]
+    }
+    in -= 4;
+    pitch_accumulator = (uint16_t) tmp[4];
+    memcpy(in, tmp, 4 * sizeof(int16_t));
+    // `in` now starts at the four carried samples, which were just written in front of the read position.
+    __m128i x4000Vec = _mm_load_si128((__m128i*) x4000);
+    // Each do-iteration emits 8 samples (two passes of 4) and takes 16 bytes off nbytes; it runs at least once.
+    do {
+        for (i = 0; i < 2; i++) {
+            int16_t* tbl0 = resample_table[pitch_accumulator * 64 >> 16];
+            // The row index above is the top 6 bits of the 16-bit fraction; in0 marks this output's first tap.
+            int16_t* in0 = in;
+            // Advance by 2 * pitch in 16.16 fixed point: whole steps move `in`, the fraction stays in the accumulator.
+            pitch_accumulator += (pitch << 1);
+            in += pitch_accumulator >> 16;
+            pitch_accumulator %= 0x10000;
+
+            int16_t* tbl1 = resample_table[pitch_accumulator * 64 >> 16];
+
+            int16_t* in1 = in;
+
+            pitch_accumulator += (pitch << 1);
+            in += pitch_accumulator >> 16;
+            pitch_accumulator %= 0x10000;
+
+            int16_t* tbl2 = resample_table[pitch_accumulator * 64 >> 16];
+
+            int16_t* in2 = in;
+
+            pitch_accumulator += (pitch << 1);
+            in += pitch_accumulator >> 16;
+            pitch_accumulator %= 0x10000;
+
+            int16_t* tbl3 = resample_table[pitch_accumulator * 64 >> 16];
+
+            int16_t* in3 = in;
+
+            pitch_accumulator += (pitch << 1);
+            in += pitch_accumulator >> 16;
+            pitch_accumulator %= 0x10000;
+            // Pair the outputs: the low 64 bits hold output 0 (or 2), the high 64 bits hold output 1 (or 3).
+            __m128i vec_in0 = move_two_4x16(in1, in0);
+
+            __m128i vec_tbl0 = move_two_4x16(tbl1, tbl0);
+
+            __m128i vec_in1 = move_two_4x16(in3, in2);
+
+            __m128i vec_tbl1 = move_two_4x16(tbl3, tbl2);
+
+            // Multiply taps by coefficients: mullo/mulhi give the low/high 16 bits of each signed product,
+            // and unpacklo/unpackhi interleave those halves back into full 32-bit products.
+            m256i res;
+            res.lo = _mm_mullo_epi16(vec_in0, vec_tbl0);
+            res.hi = _mm_mulhi_epi16(vec_in0, vec_tbl0);
+
+            __m128i out0_vec = _mm_unpacklo_epi16(res.lo, res.hi);
+            __m128i out1_vec = _mm_unpackhi_epi16(res.lo, res.hi);
+
+            res.lo = _mm_mullo_epi16(vec_in1, vec_tbl1);
+            res.hi = _mm_mulhi_epi16(vec_in1, vec_tbl1);
+
+            __m128i out2_vec = _mm_unpacklo_epi16(res.lo, res.hi);
+            __m128i out3_vec = _mm_unpackhi_epi16(res.lo, res.hi);
+
+            // Transpose so each vector holds one tap of all four outputs; they can then be summed lane-wise.
+
+            mm128_transpose(&out0_vec, &out1_vec, &out2_vec, &out3_vec);
+
+            // Add the 0x4000 rounding bias to every product.
+
+            out0_vec = _mm_add_epi32(out0_vec, x4000Vec);
+            out1_vec = _mm_add_epi32(out1_vec, x4000Vec);
+            out2_vec = _mm_add_epi32(out2_vec, x4000Vec);
+            out3_vec = _mm_add_epi32(out3_vec, x4000Vec);
+
+            // Arithmetic shift right by 15 on every biased product.
+
+            out0_vec = _mm_srai_epi32(out0_vec, 15);
+            out1_vec = _mm_srai_epi32(out1_vec, 15);
+            out2_vec = _mm_srai_epi32(out2_vec, 15);
+            out3_vec = _mm_srai_epi32(out3_vec, 15);
+
+            // Sum the four taps lane-wise: each 32-bit lane becomes one output sample.
+            __m128i sample_vec = _mm_add_epi32(_mm_add_epi32(_mm_add_epi32(out0_vec, out1_vec), out2_vec), out3_vec);
+
+            // Net effect, computed for four outputs at once:
+            // sample = ((in[0] * tbl[0] + 0x4000) >> 15) + ((in[1] * tbl[1] + 0x4000) >> 15) +
+            //          ((in[2] * tbl[2] + 0x4000) >> 15) + ((in[3] * tbl[3] + 0x4000) >> 15);
+            sample_vec = _mm_packs_epi32(sample_vec, _mm_setzero_si128()); // saturate each 32-bit sum to int16_t
+            _mm_storeu_si64(out, sample_vec); // only the low 64 bits (four samples) are written
+
+            out += 4;
+        }
+        nbytes -= 8 * sizeof(int16_t);
+    } while (nbytes > 0);
+    // Persist for the next call: pitch fraction, the next four input samples, state[5] and an 8-sample block.
+    state[4] = (int16_t) pitch_accumulator;
+    memcpy(state, in, 4 * sizeof(int16_t));
+    i = (in - in_initial + 4) & 7;
+    in -= i;
+    if (i != 0) {
+        i = -8 - i;
+    }
+    state[5] = i;
+    memcpy(state + 8, in, 8 * sizeof(int16_t));
+}
+
+#endif
+
 void aEnvSetup1Impl(uint8_t initial_vol_wet, uint16_t rate_wet, uint16_t rate_left, uint16_t rate_right) {
     rspa.vol_wet = (uint16_t)(initial_vol_wet << 8);
     rspa.rate_wet = rate_wet;
@@ -332,6 +682,8 @@ void aEnvSetup2Impl(uint16_t initial_vol_left, uint16_t initial_vol_right) {
     rspa.vol[1] = initial_vol_right;
 }
 
+#ifndef SSE2_AVAILABLE
+
 void aEnvMixerImpl(uint16_t in_addr, uint16_t n_samples, bool swap_reverb,
 				   bool neg_3, bool neg_2,
                    bool neg_left, bool neg_right,
@@ -368,6 +720,64 @@ void aEnvMixerImpl(uint16_t in_addr, uint16_t n_samples, bool swap_reverb,
     } while (n > 0);
 }
 
+#else
+// SSE2 optimized version of algorithm
+// env_mulhi_s16_u16: high 16 bits of (signed 16-bit lane of s) * (unsigned 16-bit lane of u).
+// _mm_mulhi_epu16 reads s as unsigned, which is 65536 * u too large when s < 0; subtracting u there fixes it.
+static __m128i env_mulhi_s16_u16(__m128i s, __m128i u) {
+    return _mm_sub_epi16(_mm_mulhi_epu16(s, u), _mm_and_si128(_mm_cmplt_epi16(s, _mm_setzero_si128()), u));
+}
+
+// Mixes volume-scaled input into the dry/wet buffers packed in wet_dry_addr; volumes are uint16_t (unk is unused).
+void aEnvMixerImpl(uint16_t in_addr, uint16_t n_samples, bool swap_reverb,
+				   bool neg_3, bool neg_2,
+                   bool neg_left, bool neg_right,
+                   int32_t wet_dry_addr, u32 unk)
+{
+    int16_t *in = BUF_S16(in_addr);
+    int16_t *dry[2] = {BUF_S16(((wet_dry_addr >> 24) & 0xFF) << 4), BUF_S16(((wet_dry_addr >> 16) & 0xFF) << 4)};
+    int16_t *wet[2] = {BUF_S16(((wet_dry_addr >> 8) & 0xFF) << 4), BUF_S16(((wet_dry_addr) & 0xFF) << 4)};
+    int16_t negs[4] = {neg_left ? -1 : 0, neg_right ? -1 : 0, neg_3 ? -4 : 0, neg_2 ? -2 : 0};
+    int n = ROUND_UP_16(n_samples);
+    const int n_aligned = n - (n % 8);
+
+    uint16_t vols[2] = {rspa.vol[0], rspa.vol[1]};
+    uint16_t rates[2] = {rspa.rate[0], rspa.rate[1]};
+    uint16_t vol_wet = rspa.vol_wet;
+    uint16_t rate_wet = rspa.rate_wet;
+
+    const __m128i* in_ptr = (__m128i*)in;
+    const __m128i* d_ptr[2] = { (__m128i*) dry[0], (__m128i*) dry[1] };
+    const __m128i* w_ptr[2] = { (__m128i*) wet[0], (__m128i*) wet[1] };
+    for (int N = 0; N < n_aligned; N+=8) {
+        const __m128i in_channels = _mm_loadu_si128(in_ptr++);
+        __m128i d[2] = { _mm_loadu_si128(d_ptr[0]), _mm_loadu_si128(d_ptr[1]) };
+        __m128i w[2] = { _mm_loadu_si128(w_ptr[0]), _mm_loadu_si128(w_ptr[1]) };
+        const __m128i wet_vol = _mm_set1_epi16((int16_t) vol_wet);
+
+        // sample = ((in * vols) >> 16) ^ negs, with vols taken as unsigned like the scalar version
+        __m128i s[2] = {
+            _mm_xor_si128(env_mulhi_s16_u16(in_channels, _mm_set1_epi16((int16_t) vols[0])), _mm_set1_epi16(negs[0])),
+            _mm_xor_si128(env_mulhi_s16_u16(in_channels, _mm_set1_epi16((int16_t) vols[1])), _mm_set1_epi16(negs[1]))
+        };
+        // wet sample = ((sample * vol_wet) >> 16) ^ negs; swap_reverb picks the other channel as source
+        __m128i ss[2] = {
+            _mm_xor_si128(env_mulhi_s16_u16(s[swap_reverb], wet_vol), _mm_set1_epi16(negs[2])),
+            _mm_xor_si128(env_mulhi_s16_u16(s[!swap_reverb], wet_vol), _mm_set1_epi16(negs[3]))
+        };
+        // Saturating add into the buffers; the volume ramps step once per 8 samples
+        for (int j = 0; j < 2; j++) {
+            _mm_storeu_si128((__m128i*) d_ptr[j]++, _mm_adds_epi16(s[j], d[j]));
+            _mm_storeu_si128((__m128i*) w_ptr[j]++, _mm_adds_epi16(ss[j], w[j]));
+            vols[j] += rates[j];
+        }
+        vol_wet += rate_wet;
+    }
+}
+#endif
+
+#ifndef SSE2_AVAILABLE
+
 void aMixImpl(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
     int nbytes = ROUND_UP_32(ROUND_DOWN_16(count << 4));
     int16_t *in = BUF_S16(in_addr);
@@ -395,6 +805,71 @@ void aMixImpl(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr)
     }
 }
 
+#else
+
+// Eight 16-bit lanes of 0x7FFF: the factor aMixImpl multiplies the existing `out` samples by.
+static const ALIGN_ASSET(16) int16_t x7fff[8] = { 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF,
+                                                  0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF };
+
+// SSE2 version of aMixImpl: mixes the buffer at in_addr into the buffer at out_addr, in place.
+//   count    - converted to a byte budget with ROUND_UP_32(ROUND_DOWN_16(count << 4))
+//   gain     - signed 16-bit weight for the input; -0x8000 selects the subtraction path
+//   in_addr  - input buffer address, resolved through BUF_S16
+//   out_addr - output buffer address, resolved through BUF_S16
+// General path per sample: out = ((out * 0x7fff + in * gain) + 0x4000) >> 15, narrowed by m256i_clamp_to_m128i.
+// Every outer step handles 16 samples (32 bytes) as two 8-lane vectors.
+void aMixImpl(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
+    int remaining = ROUND_UP_32(ROUND_DOWN_16(count << 4));
+    int16_t* src = BUF_S16(in_addr);
+    int16_t* dst = BUF_S16(out_addr);
+
+    if (gain == -0x8000) {
+        // Special case: out = out - in with signed saturation, no multiply involved.
+        for (; remaining > 0; remaining -= 16 * sizeof(int16_t)) {
+            for (int half = 0; half < 2; half++) {
+                const __m128i dstVec = _mm_loadu_si128((__m128i*) dst);
+                const __m128i srcVec = _mm_loadu_si128((__m128i*) src);
+                _mm_storeu_si128((__m128i*) dst, _mm_subs_epi16(dstVec, srcVec));
+                src += 8;
+                dst += 8;
+            }
+        }
+
+        // The byte budget is used up here, so the general loop below would not run either.
+        return;
+    }
+
+    // Loop-invariant operands: the 0x7FFF weight for `out`, the rounding bias and the broadcast gain.
+    __m128i keepVec = _mm_load_si128((__m128i*) x7fff);
+    __m128i biasVec = _mm_load_si128((__m128i*) x4000);
+    __m128i gainVec = _mm_set1_epi16(gain);
+
+    for (; remaining > 0; remaining -= 16 * sizeof(int16_t)) {
+        for (int half = 0; half < 2; half++) {
+            __m128i dstVec = _mm_loadu_si128((__m128i*) dst);
+            __m128i srcVec = _mm_loadu_si128((__m128i*) src);
+
+            // 16-bit by 16-bit multiplies kept as 32-bit results:
+            // out[0..8] * 0x7FFF and in[0..8] * gain.
+            m256i weightedOut = m256i_mul_epi16(dstVec, keepVec);
+            m256i weightedIn = m256i_mul_epi16(srcVec, gainVec);
+
+            // Sum both products, add the 0x4000 rounding bias, then shift right by 15.
+            m256i acc = m256i_add_m256i_epi32(weightedOut, weightedIn);
+            acc = m256i_add_m128i_epi32(acc, biasVec);
+            acc = m256i_srai(acc, 15);
+
+            // Narrow to eight int16_t lanes (clamped) and write them back over `out`.
+            _mm_storeu_si128((__m128i*) dst, m256i_clamp_to_m128i(acc));
+
+            src += 8;
+            dst += 8;
+        }
+    }
+}
+
+#endif
+
 void aS8DecImpl(uint8_t flags, ADPCM_STATE state) {
     uint8_t *in = BUF_U8(rspa.in);
     int16_t *out = BUF_S16(rspa.out);