TaskMatrix
https://github.com/chenfei-wu/TaskMatrix
TaskMatrix
TaskMatrix connects ChatGPT and a series of Visual Foundation Models to enable sending and receiving images during chatting.
Insight & Goal:
On the one hand, ChatGPT (or LLMs in general) serves as a general interface that provides a broad and diverse understanding of a wide range of topics. On the other hand, Foundation Models serve as domain experts by providing deep knowledge in specific domains. By leveraging both general and deep knowledge, we aim to build an AI system capable of handling a wide variety of tasks.
https://arxiv.org/pdf/2303.16434
https://github.com/chenfei-wu/TaskMatrix
class BackgroundRemoving:
    '''
    Template tool that removes the background of a given picture.

    It asks the visual-question-answering model what the main object is and
    hands that answer to an ObjectSegmenting instance to obtain the mask.
    '''
    # Marks this class as a template model, i.e. one assembled from other models.
    template_model = True

    def __init__(self, VisualQuestionAnswering: VisualQuestionAnswering, Text2Box: Text2Box, Segmenting: Segmenting):
        self.vqa = VisualQuestionAnswering
        self.obj_segmenting = ObjectSegmenting(Text2Box, Segmenting)

    @prompts(name="Remove the background",
             description="useful when you want to extract the object or remove the background,"
                         "the input should be a string image_path"
             )
    def inference(self, image_path):
        '''
        Given an image path, save a copy whose alpha channel is the main-object
        mask and return the path of that new file.
        '''
        mask_array = self.get_mask(image_path)
        picture = Image.open(image_path)
        alpha = Image.fromarray(mask_array)
        # The mask becomes the alpha channel of the picture.
        picture.putalpha(alpha)
        result_path = get_new_image_name(image_path, func_name="detect-something")
        picture.save(result_path)
        return result_path

    def get_mask(self, image_path):
        '''
        Description:
            given an image path, return the mask of the main object.
        Args:
            image_path (string): the file path of the image
        Outputs:
            mask (numpy.ndarray): H x W
        '''
        question = f"{image_path}, what is the main object in the image?"
        # The VQA answer is used as the text prompt for segmentation.
        main_object = self.vqa.inference(question)
        return self.obj_segmenting.get_mask(image_path, main_object)
class ConversationBot:
    '''
    Chat controller that exposes the loaded visual models as agent tools.

    The constructor instantiates the requested models, assembles every
    template model whose dependencies are loaded, and registers each
    `inference*` method as a Tool. `init_agent` builds the agent, and
    `run_text` / `run_image` handle one user turn each.
    '''

    # Paths of generated images as they appear in agent output, e.g.
    # "image/1a2b3c4d.png". Raw string with an escaped dot so that only a
    # literal ".png" suffix matches.
    _IMAGE_PATH_PATTERN = re.compile(r'(image/[-\w]*\.png)')

    def __init__(self, load_dict):
        '''
        Args:
            load_dict (dict): maps a model class name to the device it is
                loaded on, e.g.
                {'VisualQuestionAnswering':'cuda:0', 'ImageCaptioning':'cuda:1',...}
        Raises:
            ValueError: if 'ImageCaptioning' is not among the requested models.
        '''
        print(f"Initializing VisualChatGPT, load_dict={load_dict}")
        if 'ImageCaptioning' not in load_dict:
            raise ValueError("You have to load ImageCaptioning as a basic function for VisualChatGPT")

        self.models = {}
        # Load Basic Foundation Models
        for class_name, device in load_dict.items():
            self.models[class_name] = globals()[class_name](device=device)

        # Load Template Foundation Models
        for class_name, module in globals().items():
            if getattr(module, 'template_model', False):
                template_required_names = {
                    k for k in inspect.signature(module.__init__).parameters if k != 'self'}
                # Recomputed on every pass because self.models grows as
                # template models are added.
                loaded_names = {type(e).__name__ for e in self.models.values()}
                if template_required_names.issubset(loaded_names):
                    self.models[class_name] = module(
                        **{name: self.models[name] for name in template_required_names})

        print(f"All the Available Functions: {self.models}")

        # Every attribute whose name starts with 'inference' becomes a tool;
        # its name and description are read from the function object.
        self.tools = []
        for instance in self.models.values():
            for e in dir(instance):
                if e.startswith('inference'):
                    func = getattr(instance, e)
                    self.tools.append(Tool(name=func.name, description=func.description, func=func))
        self.llm = OpenAI(temperature=0)
        self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output')

    @staticmethod
    def _format_image_links(text):
        '''Replace each image path in `text` with a markdown image followed by the italicised path.'''
        return ConversationBot._IMAGE_PATH_PATTERN.sub(
            lambda m: f'![](file={m.group(0)})*{m.group(0)}*', text)

    @staticmethod
    def _snap_size(width, height, max_side=512, multiple=64):
        '''
        Scale (width, height) so both sides fit in `max_side`, then round each
        side to the nearest multiple of `multiple`.

        Each side is clamped to at least `multiple`: for very elongated
        images the rounding would otherwise give 0, which cannot be resized to.
        '''
        ratio = min(max_side / width, max_side / height)
        scaled_w, scaled_h = round(width * ratio), round(height * ratio)
        snapped_w = int(np.round(scaled_w / float(multiple))) * multiple
        snapped_h = int(np.round(scaled_h / float(multiple))) * multiple
        return max(multiple, snapped_w), max(multiple, snapped_h)

    def init_agent(self, lang):
        '''
        Clear the conversation memory and build the agent for `lang`.

        'English' selects the English prompts and UI texts; any other value
        selects the Chinese ones. Returns four `gr.update` objects
        (presumably Gradio component updates; confirm against the UI wiring).
        '''
        self.memory.clear()  # clear previous history
        if lang == 'English':
            PREFIX, FORMAT_INSTRUCTIONS, SUFFIX = VISUAL_CHATGPT_PREFIX, VISUAL_CHATGPT_FORMAT_INSTRUCTIONS, VISUAL_CHATGPT_SUFFIX
            place = "Enter text and press enter, or upload an image"
            label_clear = "Clear"
        else:
            PREFIX, FORMAT_INSTRUCTIONS, SUFFIX = VISUAL_CHATGPT_PREFIX_CN, VISUAL_CHATGPT_FORMAT_INSTRUCTIONS_CN, VISUAL_CHATGPT_SUFFIX_CN
            place = "輸入文字並回車,或者上傳圖片"
            label_clear = "清除"
        self.agent = initialize_agent(
            self.tools,
            self.llm,
            agent="conversational-react-description",
            verbose=True,
            memory=self.memory,
            return_intermediate_steps=True,
            agent_kwargs={'prefix': PREFIX, 'format_instructions': FORMAT_INSTRUCTIONS,
                          'suffix': SUFFIX},
        )
        return gr.update(visible=True), gr.update(visible=False), gr.update(placeholder=place), gr.update(value=label_clear)

    def run_text(self, text, state):
        '''
        Send one text message to the agent and append the exchange to `state`.

        Returns the updated state twice.
        '''
        # Trim the stored dialogue before the new turn.
        self.agent.memory.buffer = cut_dialogue_history(self.agent.memory.buffer, keep_last_n_words=500)
        res = self.agent({"input": text.strip()})
        # Normalise backslashes so image paths use forward slashes.
        res['output'] = res['output'].replace("\\", "/")
        response = self._format_image_links(res['output'])
        state = state + [(text, response)]
        print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n"
              f"Current Memory: {self.agent.memory.buffer}")
        return state, state

    def run_image(self, image, state, txt, lang):
        '''
        Store an uploaded image as a resized RGB PNG, caption it, and record
        the upload in the agent memory and in `state`.

        Returns the updated state twice, plus `txt` with the new image path
        appended.
        '''
        image_filename = os.path.join('image', f"{str(uuid.uuid4())[:8]}.png")
        print("======>Auto Resize Image...")
        # Context manager closes the uploaded file once the resized copy exists.
        with Image.open(image.name) as uploaded:
            width, height = uploaded.size
            width_new, height_new = self._snap_size(width, height)
            img = uploaded.resize((width_new, height_new))
        img = img.convert('RGB')
        img.save(image_filename, "PNG")
        print(f"Resize image form {width}x{height} to {width_new}x{height_new}")
        description = self.models['ImageCaptioning'].inference(image_filename)
        if lang == 'Chinese':
            Human_prompt = f'\nHuman: 提供一張名為 {image_filename}的圖片。它的描述是: {description}。 這些資訊幫助你理解這個影像,但是你應該使用工具來完成下面的任務,而不是直接從我的描述中想象。 如果你明白了, 說 \"收到\". \n'
            AI_prompt = "收到。 "
        else:
            Human_prompt = f'\nHuman: provide a figure named {image_filename}. The description is: {description}. This information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
            AI_prompt = "Received. "
        # The upload is written into memory as a completed Human/AI exchange.
        self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt
        state = state + [(f"![](file={image_filename})*{image_filename}*", AI_prompt)]
        print(f"\nProcessed run_image, Input image: {image_filename}\nCurrent state: {state}\n"
              f"Current Memory: {self.agent.memory.buffer}")
        return state, state, f'{txt} {image_filename} '
# clone the repo
git clone https://github.com/microsoft/TaskMatrix.git

# Go to directory
cd TaskMatrix

# create a new environment
conda create -n visgpt python=3.8

# activate the new environment
conda activate visgpt

# prepare the basic environments
pip install -r requirements.txt
pip install git+https://github.com/IDEA-Research/GroundingDINO.git
pip install git+https://github.com/facebookresearch/segment-anything.git

# prepare your private OpenAI key (for Linux)
export OPENAI_API_KEY={Your_Private_Openai_Key}

# prepare your private OpenAI key (for Windows)
set OPENAI_API_KEY={Your_Private_Openai_Key}

# Start TaskMatrix !
# You can specify the GPU/CPU assignment by "--load", the parameter indicates which
# Visual Foundation Model to use and where it will be loaded to
# The model and device are separated by underscore '_', the different models are separated by comma ','
# The available Visual Foundation Models can be found in the following table
# For example, if you want to load ImageCaptioning to cpu and Text2Image to cuda:0
# You can use: "ImageCaptioning_cpu,Text2Image_cuda:0"

# Advice for CPU Users
python visual_chatgpt.py --load ImageCaptioning_cpu,Text2Image_cpu

# Advice for 1 Tesla T4 15GB (Google Colab)
python visual_chatgpt.py --load "ImageCaptioning_cuda:0,Text2Image_cuda:0"

# Advice for 4 Tesla V100 32GB
python visual_chatgpt.py --load "Text2Box_cuda:0,Segmenting_cuda:0, Inpainting_cuda:0,ImageCaptioning_cuda:0, Text2Image_cuda:1,Image2Canny_cpu,CannyText2Image_cuda:1, Image2Depth_cpu,DepthText2Image_cuda:1,VisualQuestionAnswering_cuda:2, InstructPix2Pix_cuda:2,Image2Scribble_cpu,ScribbleText2Image_cuda:2, SegText2Image_cuda:2,Image2Pose_cpu,PoseText2Image_cuda:2, Image2Hed_cpu,HedText2Image_cuda:3,Image2Normal_cpu, NormalText2Image_cuda:3,Image2Line_cpu,LineText2Image_cuda:3"
https://zhuanlan.zhihu.com/p/618587935#:~:text=%E8%BF%99%E4%B8%AA%E6%9C%80%E6%96%B0%E7%9A%84%E7%A0%94%E7%A9%B6%E5%90%8D