chroma_embedding_export.json
{
"ids": [
"default-app-id--0b62d5d1c32154693be28898480371b84219fb361f4a409fcac0bfd2ffbded73",
"default-app-id--193fcdcf91e5646ae1982f7928946847ca9d80bc37400d4b2a8b1b17752f1aab",
"default-app-id--1f503cd3edc9e37ce0786b8af77afb4911962e44ce29840fa6bd98907e2f43bd",
"default-app-id--278d9b9b4b8907a2e19692f9b077ee735b823270ae9d041e55fa90ee221da2ed",
"default-app-id--2923a50e4bfce3c4a393a7dbc53065901c4bf99241cb05ed83f7bf6a386d0083",
"default-app-id--3bd3abae8623cf2cd052c0aaa84d96919bb85b754b7280e9d368cee83e023a61",
"default-app-id--3edf83267900f73333fb55101b336bc1c3960255d9eef682d282d4c53278596c",
"default-app-id--5a4b5567d2e54053ba817515d8adcf01915a50e0d68e0b981ed28e201e22fa85",
"default-app-id--5f2024e0a677e777b42c49d4aa9a7886ee685d2b83575001cc428ee6a5ac77bc",
"default-app-id--8360665bd792898a3e1ed6fafbd941e078b5603cc4ca700e282d91cdec8935f7",
"default-app-id--8f2dd01691164d1d584adbfb18bca7501fdbd05efaf299c530403976db281e25",
"default-app-id--a5b8909ee61df5ec2c6a3d00dd9a896a15d3acd0ec0d0d1a08feb29a4fffae26",
"default-app-id--b0747a5f67ed11b598f9b657890f4229afe29cedf2fb6b36dfb137b712d45d59",
"default-app-id--c90a03ab83e8bf4028e6f267f506ac82085643f8eaf03eb46371a7dc030936e5",
"default-app-id--f1c60c6b94c3223080faeb77f2a614f0cdab82d0984345ffa0b05dfc130287a2",
"default-app-id--f1c9e5f62a1b1c34617f6cbdd98869644eccbff81c064e0f81ab03c8e7b188b2"
],
"embeddings": null,
"metadatas": [
{
"app_id": "default-app-id",
"data_type": "web_page",
"doc_id": "default-app-id--ad61243ad807ca45486599e748cfd5023f5c0d396b3f264f5ce5c98c54f58a04",
"hash": "5c2dea634ecb2f293a90c026ddff33c7",
"url": "https://github.com/X-PLUG/MobileAgent"
},
{
"app_id": "default-app-id",
"data_type": "web_page",
"doc_id": "default-app-id--a68e07a59d970f7fc7b1f2ed0cda6ffc643e3cb3e433b91f8fff4c1d4aecf845",
"hash": "b63b166a68071a374ecb0fbdfc002250",
"url": "https://baai-agents.github.io/Cradle/"
},
{
"app_id": "default-app-id",
"data_type": "web_page",
"doc_id": "default-app-id--a68e07a59d970f7fc7b1f2ed0cda6ffc643e3cb3e433b91f8fff4c1d4aecf845",
"hash": "b63b166a68071a374ecb0fbdfc002250",
"url": "https://baai-agents.github.io/Cradle/"
},
{
"app_id": "default-app-id",
"data_type": "web_page",
"doc_id": "default-app-id--ef936da1cacfb3de15c2b7031d5d35476cfef8d9877562f83609fc9ac759491b",
"hash": "5db0da9c4c20cad3362b04358d11d8cf",
"url": "https://github.com/Envedity/DAIA"
},
{
"app_id": "default-app-id",
"data_type": "web_page",
"doc_id": "default-app-id--ad61243ad807ca45486599e748cfd5023f5c0d396b3f264f5ce5c98c54f58a04",
"hash": "5c2dea634ecb2f293a90c026ddff33c7",
"url": "https://github.com/X-PLUG/MobileAgent"
},
{
"app_id": "default-app-id",
"data_type": "web_page",
"doc_id": "default-app-id--ad61243ad807ca45486599e748cfd5023f5c0d396b3f264f5ce5c98c54f58a04",
"hash": "5c2dea634ecb2f293a90c026ddff33c7",
"url": "https://github.com/X-PLUG/MobileAgent"
},
{
"app_id": "default-app-id",
"data_type": "web_page",
"doc_id": "default-app-id--ef936da1cacfb3de15c2b7031d5d35476cfef8d9877562f83609fc9ac759491b",
"hash": "5db0da9c4c20cad3362b04358d11d8cf",
"url": "https://github.com/Envedity/DAIA"
},
{
"app_id": "default-app-id",
"data_type": "web_page",
"doc_id": "default-app-id--a68e07a59d970f7fc7b1f2ed0cda6ffc643e3cb3e433b91f8fff4c1d4aecf845",
"hash": "b63b166a68071a374ecb0fbdfc002250",
"url": "https://baai-agents.github.io/Cradle/"
},
{
"app_id": "default-app-id",
"data_type": "web_page",
"doc_id": "default-app-id--915b98e1771bde9f0f56bb986437c5f17207287b6b4f6235fc6b73552423ed4c",
"hash": "7c257b0315b71a347794e7ac766f6046",
"url": "https://raw.githubusercontent.com/James4Ever0/notes/master/Cybergod-like%20Agents%2C%20General%20Computer%20Control.md"
},
{
"app_id": "default-app-id",
"data_type": "web_page",
"doc_id": "default-app-id--a68e07a59d970f7fc7b1f2ed0cda6ffc643e3cb3e433b91f8fff4c1d4aecf845",
"hash": "b63b166a68071a374ecb0fbdfc002250",
"url": "https://baai-agents.github.io/Cradle/"
},
{
"app_id": "default-app-id",
"data_type": "web_page",
"doc_id": "default-app-id--a68e07a59d970f7fc7b1f2ed0cda6ffc643e3cb3e433b91f8fff4c1d4aecf845",
"hash": "b63b166a68071a374ecb0fbdfc002250",
"url": "https://baai-agents.github.io/Cradle/"
},
{
"app_id": "default-app-id",
"data_type": "web_page",
"doc_id": "default-app-id--ef936da1cacfb3de15c2b7031d5d35476cfef8d9877562f83609fc9ac759491b",
"hash": "5db0da9c4c20cad3362b04358d11d8cf",
"url": "https://github.com/Envedity/DAIA"
},
{
"app_id": "default-app-id",
"data_type": "web_page",
"doc_id": "default-app-id--a68e07a59d970f7fc7b1f2ed0cda6ffc643e3cb3e433b91f8fff4c1d4aecf845",
"hash": "b63b166a68071a374ecb0fbdfc002250",
"url": "https://baai-agents.github.io/Cradle/"
},
{
"app_id": "default-app-id",
"data_type": "web_page",
"doc_id": "default-app-id--a68e07a59d970f7fc7b1f2ed0cda6ffc643e3cb3e433b91f8fff4c1d4aecf845",
"hash": "b63b166a68071a374ecb0fbdfc002250",
"url": "https://baai-agents.github.io/Cradle/"
},
{
"app_id": "default-app-id",
"data_type": "web_page",
"doc_id": "default-app-id--915b98e1771bde9f0f56bb986437c5f17207287b6b4f6235fc6b73552423ed4c",
"hash": "7c257b0315b71a347794e7ac766f6046",
"url": "https://raw.githubusercontent.com/James4Ever0/notes/master/Cybergod-like%20Agents%2C%20General%20Computer%20Control.md"
},
{
"app_id": "default-app-id",
"data_type": "web_page",
"doc_id": "default-app-id--915b98e1771bde9f0f56bb986437c5f17207287b6b4f6235fc6b73552423ed4c",
"hash": "7c257b0315b71a347794e7ac766f6046",
"url": "https://raw.githubusercontent.com/James4Ever0/notes/master/Cybergod-like%20Agents%2C%20General%20Computer%20Control.md"
}
],
"documents": [
"GitHub - X-PLUG/MobileAgent: Mobile-Agent: The Powerful Mobile Device Operation Assistant Family Skip to content You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert X-PLUG / MobileAgent Public Notifications You must be signed in to change notification settings Fork 226 Star 2.6k Mobile-Agent: The Powerful Mobile Device Operation Assistant Family arxiv.org/abs/2406.01014 License MIT license 2.6k stars 226 forks Branches Tags Activity Star Notifications You must be signed in to change notification settings X-PLUG/MobileAgent This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository. mainBranchesTagsGo to fileCodeFolders and filesNameNameLast commit messageLast commit dateLatest commit History112 CommitsMobile-Agent-v2Mobile-Agent-v2 Mobile-Agent-v3Mobile-Agent-v3 Mobile-AgentMobile-Agent assetsassets LICENSELICENSE README.mdREADME.md README_ja.mdREADME_ja.md README_zh.mdREADME_zh.md View all filesRepository files navigation Mobile-Agent: The Powerful Mobile Device Operation Assistant Family English | \u7b80\u4f53\u4e2d\u6587 | \u65e5\u672c\u8a9e \ud83d\udcfaDemo Mobile-Agent-v3 (Note: The video is not accelerated) YouTube Bilibili Mobile-Agent-v2 Mobile-Agent-v2.mp4 Mobile-Agent Mobile-Agent.mp4 \ud83d\udce2News \ud83d\udd25[7.29] Mobile-Agent won the best demo award at the The 23rd China National Conference on Computational Linguistics (CCL 2024). On the CCL 2024, we displayed the upcoming Mobile-Agent-v3. It has smaller memory overhead (8 GB), faster reasoning speed (10s-15s per operation), and all uses open source models. Video demo, please see the last section \ud83d\udcfaDemo. \ud83d\udd25[6.27] We proposed Demo that can upload mobile phone screenshots to experience Mobile-Agent-V2 in Hugging Face and ModelScope. You don\u2019t need to configure models and devices, and you can experience it immediately. [6. 4]",
"ANY computer task via the universal human-style interface by receiving input from screens and audio and outputting keyboard and mouse actions. There are many challenges to achieving GCC: i) good alignment across multi-modalities for better understanding and decision-making; ii) precise control of keyboard and mouse to interact with the computer, which has a large, hybrid action space, including not only which key to press and where the mouse to move, but also the duration of the press and the speed of the mouse movement; iii) long-horizontal reasoning due to the partial observability of complex GCC tasks, which also leads to the demand for long-term memory to maintain past useful experiences; and iv) efficient exploration in a structured manner to discover better strategies and solutions autonomously, i.e., self-improving, which can allow agents to generalize across the myriad tasks in the digital world. The Cradle Framework To pursue GCC, we propose Cradle, a modular and flexible LMM-powered framework that can properly handle the challenges GCC presents. The framework should have the ability to understand and interpret computer screens and dynamic changes between consecutive frames from arbitrary software and be able to generate reasonable computer control actions to be executed precisely. This suggests that a multimodal model with powerful vision and reasoning capabilities, in addition to rich knowledge of computer UI and control, is a requirement. In this work, we leverage GPT-4o as the framework's backbone model. Cradle is composed of six key modules: 1) information gathering to process multimodal input, 2) self-reflection to rethink past experiences, 3) task inference for choosing the best next task, 4) skill curation for generating and updating relevant skills for a given task, 5) action planning for deciding on specific executable actions for keyboard and mouse control, and 6) memory for storage and retrieval of past experiences and known skills.",
"struggles with accurately recognizing and locating objects near the player in the 2D game, leading to difficulties for the agent to interact with objects or people, as it requires the player to stand precisely in front of them in the grid (e.g., when entering doors, using a pickaxe to break stones). This explains the inefficiency in the farming task, although the agent manages to clear up most obstacles in front of the house within 100 steps, and poor performance in the shopping task. On the other hand, relying on episodic summarization and task inference, Cradle manages to obtain the parsnip by watering the seed for four days and harvesting. Dealer's Life 2: Cradle demonstrates robust performance and efficient profit-making on the weekly shop management task, successfully finalizing 93.6% of potential transactions, with an average of two negotiation rounds per customer, and generally aiming for a profit rate of over 50% at the initial offer. It consistently generates profit across all runs, maintaining a total profit rate of +39.6%, peaking at +87.4% in a single run. Software Application Software Application Result Multiple tasks remain challenging. Even with a well-known GUI, like Chrome and Outlook, GPT-4o still cannot recognize specific UI items to interact with and also struggles with visual context. For example, it may forget to press the Save button in an open dialog, or not distinguish between a nearby enabled button versus a distant and disabled one (e.g., when posting on Twitter). The phenomenon is more severe in UIs with non-standard layouts, like CapCut, Meitu, and Feishu. Lacking prior knowledge, GPT-4o fails in task inference and selecting the correct skills. OSWorld OSWorld Result Cradle achieves the overall highest success rate in OSWorld, compared to the baselines, at 7.81% without relying on any internal APIs to provide extra grounding labels, Set-of-Mark (SoM). Cradle's information gathering module improves grounding for more precise action",
"for interaction with the computer. Furthermore, the DAIA will be built with a built-in memory, self-evaluating and optimizing system from the start. Here is our current blueprint for the DAIA and its features: 2. Features \ud83c\udf1f Intelligent Interaction: DAIA can interact with your PC using natural language, making it easy to communicate your needs. (in progress) Goal Completion: Automate and complete goals (in progress) Task Automation: Automate repetitive tasks and processes, saving time and effort. (in progress) Information Retrieval: Get real-time information, answers, and data from the web. (in progress) Multi-Platform Support: DAIA is compatible with various operating systems and applications. (in progress) Customization: Tailor DAIA to your specific needs and preferences through custom scripts and plugins. (not done yet) Memory: Each action is saved into the memory, allowing you to start where you left. (in progress) Security: Ensure your data and interactions are secure with robust encryption and privacy measures. (not done yet) Endless Scalability: DAIA can make multiple copies of itself and create its own network of DAIAs provided the computing power. (in progress) Automation of Big Goals: DAIA can automate big goals with its capability to make multiple versions of itself, therefore making the process faster. (in progress) 3. Usage \ud83e\udd1d Install the DAIA by running git clone https:/github.com/Envedity/DAIA.git in your desired path, or by downloading and extracting the zip file. Make a python 3.11 env using the requirements.txt file. Run the DAIA.py file by typing python DAIA.py in the DAIA directory. Choose a version of the DAIA you want to use. Give it a goal. Let it know if you agree with what it suggests for goal compleation. Sit back and let the DAIA compleate your goal all by itself 4. Contribution \ud83d\ude4c We welcome contributions from the DAIA community to help improve and expand the capabilities of our AI agent. It is still in its early development stage so there",
"properties Stars 2.6k stars Watchers 42 watching Forks 226 forks Report repository Releases No releases published Packages 0 No packages published Contributors 5 Languages Python 100.0% You can\u2019t perform that action at this time.",
"Modelscope-Agent has supported Mobile-Agent-V2, based on Android Adb Env, please check in the application. [6. 4] We proposed Mobile-Agent-v2, a mobile device operation assistant with effective navigation via multi-agent collaboration. [3.10] Mobile-Agent has been accepted by the ICLR 2024 Workshop on Large Language Model (LLM) Agents. \ud83d\udcf1Version Mobile-Agent-v3 Mobile-Agent-v2 - Mobile Device Operation Assistant with Effective Navigation via Multi-Agent Collaboration Mobile-Agent - Autonomous Multi-Modal Mobile Device Agent with Visual Perception \u2b50Star History \ud83d\udcd1Citation If you find Mobile-Agent useful for your research and applications, please cite using this BibTeX: @article{wang2024mobile2, title={Mobile-Agent-v2: Mobile Device Operation Assistant with Effective Navigation via Multi-Agent Collaboration}, author={Wang, Junyang and Xu, Haiyang and Jia, Haitao and Zhang, Xi and Yan, Ming and Shen, Weizhou and Zhang, Ji and Huang, Fei and Sang, Jitao}, journal={arXiv preprint arXiv:2406.01014}, year={2024} } @article{wang2024mobile, title={Mobile-Agent: Autonomous Multi-Modal Mobile Device Agent with Visual Perception}, author={Wang, Junyang and Xu, Haiyang and Ye, Jiabo and Yan, Ming and Shen, Weizhou and Zhang, Ji and Huang, Fei and Sang, Jitao}, journal={arXiv preprint arXiv:2401.16158}, year={2024} } \ud83d\udce6Related Projects AppAgent: Multimodal Agents as Smartphone Users mPLUG-Owl & mPLUG-Owl2: Modularized Multimodal Large Language Model Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond GroundingDINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection CLIP: Contrastive Language-Image Pretraining About Mobile-Agent: The Powerful Mobile Device Operation Assistant Family arxiv.org/abs/2406.01014 Topics android agent harmony ios app gui automation mobile copilot multimodal mobile-agents mllm multimodal-large-language-models gpt4v multimodal-agent Resources Readme License MIT license Activity Custom",
"GitHub - Envedity/DAIA: Digital Artificial Intelligence Agent Skip to content You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert Envedity / DAIA Public Notifications You must be signed in to change notification settings Fork 0 Star 2 Digital Artificial Intelligence Agent License GPL-3.0 license 2 stars 0 forks Branches Tags Activity Star Notifications You must be signed in to change notification settings Envedity/DAIA This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository. mainBranchesTagsGo to fileCodeFolders and filesNameNameLast commit messageLast commit dateLatest commit History28 Commits.github/workflows.github/workflows DAIA_GPT4VDAIA_GPT4V DesignDesign utilsutils .gitattributes.gitattributes .gitignore.gitignore DAIA.pyDAIA.py LICENCELICENCE PipfilePipfile README.mdREADME.md config.pyconfig.py requirements.txtrequirements.txt View all filesRepository files navigationDigital Artificial Intelligence Agent (V3) \ud83d\udc68\u200d\ud83d\udcbb\ud83e\udd16 DAIA is a powerful Digital Artificial Intelligence Agent that enables intelligent interactions with your PC to help you achieve your goals. Project Incomplete Table of Contents \ud83d\udcdc Introduction Features Usage Contribution License 1. Introduction \ud83d\ude80 DAIA is a cutting-edge AI agent designed to enhance your productivity by intelligently interacting with your computer and completing big goals. With DAIA, you can complete goals, automate tasks, gather information, and perform various operations seamlessly, just like you would do yourself. The main difference with the DAIA compared to other AI Agents is that it interacts with your computer through a vision system (GPT-4V) and task completion system allowing it to be capable of doing many more tasks and goals compared to other AI Agents that use the Terminal or CMD",
"Cradle: Empowering Foundation Agents Towards General Computer Control Cradle: Empowering Foundation Agents Towards General Computer Control Weihao Tan3 *, Wentao Zhang3 *, Xinrun Xu5 *, Haochong Xia3 \u2020, Ziluo Ding2 \u2020, Boyu Li3 \u2020, Bohan Zhou4 \u2020, Junpeng Yue4 \u2020, Jiechuan Jiang4 \u2020, Yewen Li3 \u2020, Ruyi An3 \u2020, Molei Qin3 \u2020, Chuqiao Zong3 \u2020, Longtao Zheng3 \u2020, YuJie Wu1 \u2020, Xiaoqiang Chai1 \u2020, Yifei Bi2, Tianbao Xie6, Pengjie Gu3, Xiyun Li2, Ceyao Zhang7, Long Tian1, Chaojie Wang1, Xinrun Wang3 \u2021, B\u00f6rje F. Karlsson2 \u2021, Bo An3, 1 \u00a7, Shuicheng Yan1 \u00a7, Zongqing Lu4, 2 \u00a7 1 Skywork AI 2 Beijing Academy of Artificial Intelligence 3 Nanyang Technological University, Singapore 4 Peking University 5 Institute of Software, Chinese Academy of Sciences 6 The University of Hong Kong 7 The Chinese University of Hong Kong, Shenzhen * Equal contribution \u2020 Core contribution \u2021 Equal advising \u00a7 Corresponding authors Paper arXiv Code The Cradle framework empowers nascent foundation models to perform complex computer tasks via the same general interface humans use: screen as input and keyboard & mouse operations as output. Abstract Despite the success in specific scenarios, existing foundation agents still struggle to generalize across various virtual scenarios, mainly due to the dramatically different encapsulations of environments with manually designed observation and action spaces. To handle this issue, we propose the General Computer Control (GCC) setting to restrict foundation agents to interact with software through the most unified and standardized interface, i.e., using screenshots as input and keyboard and mouse actions as output. We introduce Cradle, a modular and flexible LMM-powered framework, as a preliminary attempt towards GCC. Enhanced by six key modules: Information Gathering, Self-Reflection, Task Inference, Skill Curation, Action Planning, and Memory, Cradle is able to understand input screenshots and output executable code for low-level keyboard and mouse control after",
"game playing agents",
"Implementation Empirical Studies All Game Tasks All Software Application Tasks Major Results RDR2 RDR2 Result Cradle can achieve a high success rate in simple tasks like following an NPC or going to specific locations on the ground (e.g., Follow Dutch and Go to Barn). Another following task, Follow Javier, and the searching task, Search John, are dangerous for the rugged and winding path up to the snow mountain with cliffs. In addition, GPT-4o struggles with real-time combat tasks and searching tasks due to its inability to accurately locate enemies or objects and precisely time decisions. Even equipped with additional detection tools, like Grounding DINO (Liu et al., 2023), the success rate drops significantly to 20% in the task of Protect Dutch, which requires nighttime combat. Additionally, indoor tasks like Search for Supplies and Search Barn are also challenging due to GPT-4o's poor spatial perception, which finds it difficult to locate target objects and ends up circling aimlessly. The open-ended task, Buy Supply, shows that even without in-game guidance, Cradle still manages to complete the task with its superior reasoning ability. Other Games Cities: Skylines Stardew Valley Dealer's Life 2 Cities: Skylines Result Stardew Valley Result Dealer's Life 2 Result Cities: Skylines: While Cradle manages to build roads in a closed loop to ensure smooth traffic flow, place multiple wind turbines to provide sufficient electricity supply, and cover more than 90% of available area with residential, commercial, and industrial zones, it fails to provide sufficient water supply reliably. The most common failure case is that water pipes are not connected with each other, resulting in localized water shortages in the city, and preventing new residents from moving in. With human assistance to correct the mistakes within three unit operations, the city built by Cradle can eventually reach a population of more than one thousand. Stardew Valley: We surprisingly find that GPT-4o",
"high-level planning, so that Cradle can interact with any software and complete long-horizon complex tasks without relying on any built-in APIs. Experimental results show that Cradle exhibits remarkable generalizability and impressive performance across four previously unexplored commercial video games, five software applications, and a comprehensive benchmark, OSWorld. To our best knowledge, Cradle is the first to enable foundation agents to follow the main storyline and complete 40-minute-long real missions in the complex AAA game Red Dead Redemption 2 (RDR2). Cradle can also create a city of a thousand people in Cities:~Skylines, farm and harvest parsnips in Stardew Valley, and trade and bargain with a maximal weekly total profit of 87% in Dealer's Life 2. Cradle can not only operate daily software, like Chrome, Outlook, and Feishu, but also edit images and videos using Meitu and CapCut. With a unified interface to interact with any software, Cradle greatly extends the reach of foundation agents by enabling the easy conversion of any software, especially complex games, into benchmarks to evaluate agents' various abilities and facilitate further data collection, thus paving the way for generalist agents. Game Videos RDR2: Main Storyline RDR2: Open-ended World Stardew Valley Cities: Skylines Dealer's Life 2 Software Videos Chrome Outlook CapCut Meitu Feishu General Computer Control Computers, as the most important and universal interface that connects humans and the increasingly digital world, provide countless rich software, including applications and realistic video games for agents to interact with, while avoiding the challenges of robots in reality, such as hardware requirements, constraints of practicability, and possible catastrophic failures. Mastering these virtual environments is a promising path for foundation agents to achieve generalizability. Therefore, we propose the General Computer Control (GCC) setting: Building foundation agents that can master",
"is a lot to be done and we urgently need your support in this effort. Here's how you can contribute: Join Our Discord Server: If you're a developer or someone who is interested in contributing, please join our Discord server The Envedity Network at: https:/discord.gg/V4T6QFUw9c, there you can become a developer and will be able to directly contribute to the main DAIA repo with us, as well as share your feedback, suggestions, and bug reports with us and more. Your insights are valuable in shaping the future of DAIA Here is what we have already done from the blueprint: Feedback: Share your feedback, suggestions, and bug reports with us. You can do this by opening an issue on our feedback repository or in our Discord server. Share: Share the DAIA project with others you know to spread the word (DAIA repo link: https:/github.com/Envedity/DAIA) Support (Sponsor the project): You can sponsor us through this e-mail: envedity@gmail.com or donate/support us at https:/patreon.com/user?u=108155871&utm_medium=clipboard_copy&utm_source=copyLink&utm_campaign=creatorshare_creator&utm_content=join_link We appreciate your support in making DAIA even better for all users! \ud83d\ude4f Let's build the future of AGI Together! 5. License \ud83d\udcc4 DAIA is under the GNU Version 3 licence (https:/fsf.org/). For the most up-to-date information, visit DAIA's official website. \ud83c\udf10\ud83d\ude80 About Digital Artificial Intelligence Agent Topics machine-learning ai ml agi auto-agent ai-agent llm llm-agent ai-vision-model gpt4v gpt4vision Resources Readme License GPL-3.0 license Activity Custom properties Stars 2 stars Watchers 0 watching Forks 0 forks Report repository Releases No releases published Contributors 3 Languages Python 100.0% You can\u2019t perform that action at this time.",
"Tian and Chaojie Wang and Xinrun Wang and B\u00f6rje F. Karlsson and Bo An and Shuicheng Yan and Zongqing Lu}, journal={arXiv preprint arXiv:2403.03186}, year={2024} }",
"execution, increasing its performance. The self-reflection module greatly helps it to correctly predict infeasible tasks and subsequently fix mistakes, as exemplified in the professional domain results, where it achieves a 20.41% success rate, significantly surpassing the baselines. Conclusion In this work, we introduce GCC, a general and challenging setting with a unified and standard interface for control of diverse video games and other software (via screenshots, and keyboard and mouse operations), paving the way towards general foundation agents across all digital world tasks. To properly address the challenges GCC presents, we propose a novel open-source framework, Cradle, which exhibits strong performance in reasoning and performing actions to accomplish real missions or tasks in a set of complex video games and common software applications. To the best of our knowledge, Cradle is the first framework that enables foundation agents to succeed in such a diverse set of environments without relying on any built-in APIs. The success of Cradle greatly extends the reach of foundation agents and demonstrates the feasibility of converting any software, especially complex games, into benchmarks to evaluate agents' general intelligence and facilitate further data collection for self-improvement. AlthoughCradle can still face difficulties in certain tasks, it serves as a pioneering work to develop more powerful LMM-based general agents across computer control tasks, combining both further framework enhancements and new advances in LMMs. BibTeX @article{tan2024cradle, title={Cradle: Empowering Foundation Agents towards General Computer Control}, author={Weihao Tan and Wentao Zhang and Xinrun Xu and Haochong Xia and Ziluo Ding and Boyu Li and Bohan Zhou and Junpeng Yue and Jiechuan Jiang and Yewen Li and Ruyi An and Molei Qin and Chuqiao Zong and Longtao Zheng and Yujie Wu and Xiaoqiang Chai and Yifei Bi and Tianbao Xie and Pengjie Gu and Xiyun Li and Ceyao Zhang and Long",
"- title: Cybergod-like Agents, General Computer Control created: 2024-03-14T08:53:22+00:00 modified: 2024-07-08T16:16:16+08:00 - Cybergod-like Agents, General Computer Control matmul-free llm https:/arxiv.org/abs/2406.02528 - https:/github.com/KingNishHF/OpenGPT-4o [aider](https:/aider.chat/) coding assist devon alternative kyutai [moshi](https:/moshi.chat/) gpt4o alternative [firefunction v2](https:/hf-mirror.com/fireworks-ai/llama-3-firefunction-v2) function calling llm codegeex4-all-9b - https:/github.com/lavague-ai/LaVague https:/github.com/Upsonic/Tiger computer agents: https:/github.com/slavakurilyak/awesome-ai-agents - [gui agent model](https:/hf-mirror.com/shuaishuaicdp/GUI-Vid) trained on [gui-world](https:/github.com/Dongping-Chen/GUI-World) [gui agent datasets](https:/hf-mirror.com/search/full-text?q=gui+agent&type=dataset) on huggingface autocoder with pretrained models, has access to terminal: https:/github.com/bin123apple/AutoCoder - you can label the gui manually, write comments to each ui element and write exact operate steps about the exact execution steps. - GUI detection algorithm: https:/github.com/MulongXie/UIED - minified segment anything model: https:/github.com/xinghaochen/TinySAM - https:/github.com/graylan0/gptcomputer https:/github.com/patterns-complexity/gpt-pc-control https:/github.com/b5marwan/gpt-vision-agent https:/github.com/rogeriochaves/driver https:/github.com/s-a-ng/control-pc-with-gpt4-vision - gpt related: https:/github.com/szczyglis-dev/py-gpt https:/github.com/EwingYangs/awesome-open-gpt - gpt-4o is gaining popularity in computer control. https:/github.com/CK92149/GPTComputerAutomation https:/github.com/onuratakan/gpt-computer-assistant https:/github.com/kyegomez/GPT4o - terminal controlling agent: https:/github.com/greshake/Alice - Simulated computer control environments: https:/github.com/xlang-ai/OSWorld - Multi-agent framework, routing: https:/python.langchain.com/v0.1/docs/langgraph - Devin open source alternative:",
"https:/github.com/entropy-research/Devon https:/github.com/stitionai/devika https:/github.com/semanser/codel - Web browsing agent: https:/github.com/THUDM/AutoWebGLM - [Agent-Eval-Refine](https:/hf-mirror.com/Agent-Eval-Refine) contains models for [GUI captioning](https:/hf-mirror.com/Agent-Eval-Refine/Captioner), [iOS finetuned CogAgent](https:/hf-mirror.com/Agent-Eval-Refine/CogAgent-iOS-SelfTrain), and several GUI agent datasets. - [ScreenAgent](https:/github.com/niuzaisheng/ScreenAgent) includes a lots of related computer control papers and projects in, along with a self-trained model on huggingface. Similar projects: https:/github.com/TobiasNorlund/UI-Act Listed projects: https:/github.com/x-plug/mobileagent https:/github.com/google-research/google-research/tree/master/screen2words https:/github.com/rainyugg/blip-adapter https:/github.com/imnearth/coat https:/github.com/xbmxb/aagent https:/github.com/princeton-nlp/ptp https:/github.com/njucckevin/seeclick https:/github.com/thudm/autowebglm https:/github.com/OS-Copilot/OS-Copilot Environments: https:/github.com/google-deepmind/android_env https:/github.com/x-lance/mobile-env Datasets: https:/github.com/google-research-datasets/screen_qa - [Open-Interface](https:/github.com/AmberSahdev/Open-Interface) utilizes GPT-4V to control computer interface. - [Devin](https:/www.cognition-labs.com/) is an AI agent that can solve many real-world Github issues, with access to browser, terminal and code editor. [Cradle](https:/github.com/BAAI-Agents/Cradle) is a general computer controlling agent developed to play Red Dead Redeption II. [Pythagora](https:/github.com/Pythagora-io/gpt-pilot) aka GPT Pilot is a true AI developer that writes code, debugs it, talks to you when it need. - Devin open source counterparts: - [OpenDevin](https:/github.com/OpenDevin/OpenDevin) - [MetaGPT](https:/github.com/geekan/MetaGPT) - [SWE-Agent](https:/github.com/princeton-nlp/SWE-agent) - [GPA-LM](https:/github.com/BAAI-Agents/GPA-LM): a list of"
],
"uris": null,
"data": null
}
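
The shape of this export (top-level `ids`, `embeddings`, `metadatas`, `documents`, `uris`, `data` keys, with `embeddings` null) matches the dict returned by a ChromaDB `Collection.get()` call without `"embeddings"` in its `include` list. As a minimal sketch — assuming that is how the file was produced and that the `chromadb` package is available locally; the collection name `imported` is hypothetical — the export could be loaded back into a collection like this:

```python
# Minimal sketch, assuming this file holds the dict returned by a Chroma
# Collection.get(). "embeddings" is null in the export, so Chroma will
# re-embed the documents with the collection's embedding function on add.
import json

import chromadb  # assumes the chromadb package is installed

with open("chroma_embedding_export.json") as f:
    export = json.load(f)

client = chromadb.Client()  # in-memory client; use PersistentClient for disk
collection = client.get_or_create_collection("imported")  # hypothetical name

# Re-add the records under their original ids and per-record metadata.
collection.add(
    ids=export["ids"],
    documents=export["documents"],
    metadatas=export["metadatas"],
)

print(collection.count())  # expect 16, one per id in the export
```

Note that several records share the same `doc_id`, `hash`, and `url` in their metadata: they are distinct chunks of the same source page, which is why the top-level `ids` (16 unique values) are the keys Chroma deduplicates on, not the metadata fields.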