mitsua-japanese-clip-vit-b-16 / configuration_mitsua_japanese_clip.py

Upload 15 files

f195a50 verified 20 days ago

2.04 kB

	# coding=utf-8
	# Copyright 2024 The HuggingFace Inc. team. + Abstract Engine. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	from transformers import PretrainedConfig, CLIPVisionConfig, SiglipTextConfig

	class MitsuaJapaneseCLIPConfig(PretrainedConfig):
	    r"""
	    Configuration for the Mitsua Japanese CLIP model.

	    It is a composite configuration: it holds a [`CLIPVisionConfig`] for the
	    vision side (`vision_config`) and a [`SiglipTextConfig`] for the text side
	    (`text_config`), together with a few top-level values.

	    Args:
	        text_config (`dict` or [`PretrainedConfig`], *optional*):
	            Keyword arguments used to build the [`SiglipTextConfig`]. A config
	            instance is converted with `to_dict()` first. `None` yields a
	            default [`SiglipTextConfig`].
	        vision_config (`dict` or [`PretrainedConfig`], *optional*):
	            Keyword arguments used to build the [`CLIPVisionConfig`]. A config
	            instance is converted with `to_dict()` first. `None` yields a
	            default [`CLIPVisionConfig`].
	        projection_dim (`int`, *optional*, defaults to 512):
	            Stored as `self.projection_dim`. Presumably the size of the shared
	            image/text embedding space -- confirm against the modeling code.
	        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
	            Stored as `self.logit_scale_init_value`. Presumably the initial
	            value of the learnable logit scale -- confirm against the modeling
	            code.
	        kwargs (*optional*):
	            Forwarded unchanged to [`PretrainedConfig`].
	    """

	    model_type = "mitsua_japanese_clip"

	    def __init__(
	        self,
	        text_config=None,
	        vision_config=None,
	        projection_dim=512,
	        logit_scale_init_value=2.6592,
	        **kwargs,
	    ):
	        super().__init__(**kwargs)

	        # Accept ready-made config objects as well as plain dicts. The original
	        # code unpacked the argument with ``**`` directly, which raises
	        # ``TypeError`` for a ``PretrainedConfig`` instance.
	        if isinstance(text_config, PretrainedConfig):
	            text_config = text_config.to_dict()
	        elif text_config is None:
	            text_config = {}

	        if isinstance(vision_config, PretrainedConfig):
	            vision_config = vision_config.to_dict()
	        elif vision_config is None:
	            vision_config = {}

	        # Sub-configs are always rebuilt as the concrete classes this model
	        # expects, whatever class the caller's object had.
	        self.vision_config = CLIPVisionConfig(**vision_config)
	        self.text_config = SiglipTextConfig(**text_config)

	        self.projection_dim = projection_dim
	        self.logit_scale_init_value = logit_scale_init_value
	        # Fixed value; not exposed as a constructor argument.
	        self.initializer_factor = 1.0

	    @classmethod
	    def from_vision_text_configs(
	        cls, vision_config: PretrainedConfig, text_config: PretrainedConfig, **kwargs
	    ):
	        r"""
	        Instantiate a [`MitsuaJapaneseCLIPConfig`] (or a derived class) from a
	        vision model configuration and a text model configuration.

	        Args:
	            vision_config ([`PretrainedConfig`]):
	                Vision configuration; its `to_dict()` output is passed on as
	                `vision_config`.
	            text_config ([`PretrainedConfig`]):
	                Text configuration; its `to_dict()` output is passed on as
	                `text_config`.
	            kwargs (*optional*):
	                Extra keyword arguments forwarded to the constructor.

	        Returns:
	            [`MitsuaJapaneseCLIPConfig`]: An instance of a configuration object
	        """
	        return cls(
	            vision_config=vision_config.to_dict(),
	            text_config=text_config.to_dict(),
	            **kwargs,
	        )