@@ -806,6 +806,25 @@ def whisper_log_callback(level, text, user_data):
806806 do_nothing = 0
807807
808808
# C prototype of the log hook: void (*)(int level, const char *text, void *user_data).
# NOTE: the decorator below rebinds this same name to the wrapped callback instance,
# so after this block `mtmd_log_callback` refers to the callback, not the prototype.
mtmd_log_callback = ctypes.CFUNCTYPE(None, ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p)


@mtmd_log_callback
def mtmd_log_callback(level, text, user_data):

    """ Controls the display log output from mtmd engine - currently exposing two options 'ON' or 'OFF'.

    The message is written to stderr unless the environment variable `llama_cpp_verbose`
    is set to exactly "OFF".  Any other value (or an unset variable) keeps logging on.

    -- level: int log level passed by the native side (currently unused)
    -- text: log message as bytes; ctypes converts a `char *` argument to bytes, or to
             None for a NULL pointer, in which case nothing is printed
    -- user_data: pointer passed back by the native side (currently unused)

    Returns None. """

    # note: reserving level and user_data as options for the future
    # --adapted from more sophisticated logging mechanism in llama-cpp-python
    # --integrated with llama_cpp_verbose logging option for integrated debugging

    # nothing to show: either logging is switched off or there is no message
    if text is None or os.environ.get("llama_cpp_verbose") == "OFF":
        return

    # errors="replace" keeps a malformed or truncated byte sequence from raising
    # inside the callback - the bad bytes are shown as U+FFFD instead
    print(text.decode("utf-8", errors="replace"), end="", flush=True, file=sys.stderr)
826+
827+
809828class GGUFConfigs :
810829
811830 """GGUFConfigs is main global configuration object for GGUF Generative Models. Most of these config items
@@ -1089,3 +1108,176 @@ class whisper_full_params(ctypes.Structure):
10891108
10901109 ]
10911110
1111+
""" MTMD & CLIP GGUF Interface Configurations """

# Type-hint aliases for the mtmd handles - each one is a distinct NewType over int
mtmd_context_p = NewType("mtmd_context_p", int)
mtmd_bitmap_p = NewType("mtmd_bitmap_p", int)
mtmd_image_tokens_p = NewType("mtmd_image_tokens_p", int)
mtmd_input_chunk_p = NewType("mtmd_input_chunk_p", int)
mtmd_input_chunks_p = NewType("mtmd_input_chunks_p", int)

# ctypes representation used in argtypes / restype - every handle is a void pointer
mtmd_context_p_ctypes = ctypes.c_void_p
mtmd_bitmap_p_ctypes = ctypes.c_void_p
mtmd_image_tokens_p_ctypes = ctypes.c_void_p
mtmd_input_chunk_p_ctypes = ctypes.c_void_p
mtmd_input_chunks_p_ctypes = ctypes.c_void_p

# Input chunk type codes
# -- presumably mirror the chunk type enum in llama.cpp/tools/mtmd/mtmd.h - TODO confirm
MTMD_INPUT_CHUNK_TYPE_TEXT = 0
MTMD_INPUT_CHUNK_TYPE_IMAGE = 1
MTMD_INPUT_CHUNK_TYPE_AUDIO = 2
1132+
1133+
class mtmd_context_params(ctypes.Structure):

    """ ctypes layout of the mtmd context parameters for mtmd releases b7062+, e.g., starting ~Nov 2025.

    Field order and types must match the C struct of the loaded mtmd library exactly. """

    # if errors, look at this interface in llama.cpp/tools/mtmd/mtmd.h
    # -- this api has been evolving
    # -- mtmd_context_params_alt_pre7062 is the drop-in replacement if using a mtmd lib
    #    from before Nov 2025

    _fields_ = [

        ("use_gpu", ctypes.c_bool),
        ("print_timings", ctypes.c_bool),
        ("n_threads", ctypes.c_int),
        ("image_marker", ctypes.c_char_p),
        ("media_marker", ctypes.c_char_p),

        # "verbosity" (ctypes.c_int, ggml_log_level) was removed from the struct in b7062

        # new starting b6935
        ("llama_flash_attn_type", ctypes.c_int),
        ("warmup", ctypes.c_bool),
        ("image_min_tokens", ctypes.c_int),
        ("image_max_tokens", ctypes.c_int),

        # NOTE(review): confirm the relative order of these two pointer-sized fields
        # against the mtmd.h of the library being loaded
        ("cb_eval_user_data", ctypes.c_void_p),
        ("cb_eval", ggml_backend_sched_eval_callback),
    ]
1160+
1161+
class mtmd_context_params_alt_pre7062(ctypes.Structure):

    """ Deprecated ctypes layout of the mtmd context parameters - maps to mtmd releases in the
    second half of 2025, up to the b7062 release in November 2025.

    Unlike the newer layout, this one still carries the `verbosity` field right after `n_threads`. """

    _fields_ = [
        ("use_gpu", ctypes.c_bool),
        ("print_timings", ctypes.c_bool),
        ("n_threads", ctypes.c_int),

        # ggml_log_level
        ("verbosity", ctypes.c_int),

        ("image_marker", ctypes.c_char_p),
        ("media_marker", ctypes.c_char_p),
    ]
1175+
1176+
class mtmd_input_text(ctypes.Structure):

    """ ctypes layout of the text input for mtmd_tokenize - a char pointer plus two bool flags. """

    _fields_ = [

        # prompt text as a C string (bytes on the python side)
        ("text", ctypes.c_char_p),

        # tokenizer flags - see mtmd.h for their exact meaning
        ("add_special", ctypes.c_bool),
        ("parse_special", ctypes.c_bool),
    ]
1183+
1184+
def add_libmtmd_ctypes_declarations(_libmtmd):

    """ Main mtmd library interfaces - attaches argtypes and restype to each mtmd function
    exposed on the loaded library handle.

    -- _libmtmd: loaded mtmd shared library handle (e.g., a ctypes.CDLL object)

    Returns the same `_libmtmd` object, with the declarations applied.

    Raises AttributeError if one of the required symbols is not found on the library.
    The optional `mtmd_helper_log_set` symbol is skipped silently when missing. """

    void_p = ctypes.c_void_p
    size_t = ctypes.c_size_t
    uint8_p = ctypes.POINTER(ctypes.c_uint8)

    # (symbol name, argtypes, restype) - applied in this order
    required_signatures = [

        ("mtmd_default_marker", [], ctypes.c_char_p),

        ("mtmd_context_params_default", [], mtmd_context_params),

        ("mtmd_init_from_file",
         [ctypes.c_char_p, llama_model_p_ctypes, mtmd_context_params],
         mtmd_context_p_ctypes),

        ("mtmd_free", [mtmd_context_p_ctypes], None),

        ("mtmd_support_vision", [mtmd_context_p_ctypes], ctypes.c_bool),

        ("mtmd_bitmap_init",
         [ctypes.c_uint32, ctypes.c_uint32, uint8_p],
         mtmd_bitmap_p_ctypes),

        ("mtmd_bitmap_free", [mtmd_bitmap_p_ctypes], None),

        ("mtmd_input_chunks_init", [], mtmd_input_chunks_p_ctypes),

        ("mtmd_input_chunks_free", [mtmd_input_chunks_p_ctypes], None),

        ("mtmd_input_chunks_size", [mtmd_input_chunks_p_ctypes], size_t),

        ("mtmd_input_chunks_get",
         [mtmd_input_chunks_p_ctypes, size_t],
         mtmd_input_chunk_p_ctypes),

        ("mtmd_tokenize",
         [mtmd_context_p_ctypes, mtmd_input_chunks_p_ctypes,
          ctypes.POINTER(mtmd_input_text), ctypes.POINTER(mtmd_bitmap_p_ctypes),
          size_t],
         ctypes.c_int),

        ("mtmd_input_chunk_get_n_tokens", [mtmd_input_chunk_p_ctypes], size_t),

        ("mtmd_input_chunk_get_type", [mtmd_input_chunk_p_ctypes], ctypes.c_int),

        ("mtmd_input_chunk_get_tokens_text",
         [mtmd_input_chunk_p_ctypes, ctypes.POINTER(size_t)],
         ctypes.POINTER(llama_token)),

        # mtmd_helper_bitmap_init_from_buf
        ("mtmd_helper_bitmap_init_from_buf",
         [mtmd_context_p_ctypes, uint8_p, size_t],
         mtmd_bitmap_p_ctypes),

        # mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname)
        ("mtmd_helper_bitmap_init_from_file",
         [mtmd_context_p_ctypes, ctypes.c_char_p],
         mtmd_bitmap_p_ctypes),

        ("mtmd_helper_get_n_tokens", [mtmd_input_chunks_p_ctypes], size_t),

        ("mtmd_helper_eval_chunk_single",
         [mtmd_context_p_ctypes,
          llama_context_p_ctypes,
          mtmd_input_chunk_p_ctypes,
          llama_pos, llama_seq_id,
          ctypes.c_int, ctypes.c_bool, ctypes.POINTER(llama_pos)],
         ctypes.c_int),
    ]

    for symbol, argtypes, restype in required_signatures:
        # a missing required symbol raises AttributeError here, on purpose
        fn = getattr(_libmtmd, symbol)
        fn.argtypes = argtypes
        fn.restype = restype

    # expose mtmd_helper_log_set - but catch if not found
    # -- only the symbol lookup is guarded, and only for the "not found" error, so that
    #    any other failure is not hidden
    try:
        mtmd_helper_log_set = _libmtmd.mtmd_helper_log_set
    except AttributeError:
        pass
    else:
        mtmd_helper_log_set.argtypes = [void_p, void_p]
        mtmd_helper_log_set.restype = None

    return _libmtmd
1283+
0 commit comments