Here's an implementation that uses OpenAI's new computer-use
model with their new Responses API – it's pretty cool.
See docs here: https://platform.openai.com/docs/guides/tools-computer-use
I wrote this script quite quickly so I'll add improvements later... but it works!
In this demo, you type a description of the on-screen element you want to click, and the script then uses vision AI to find that element and click it automatically.
Here's the script:
/**
 * BetterTouchTool "Run Real JavaScript" script: click the on-screen element
 * described by the BTT string variable CLICK_ON_ELEMENT.
 *
 * Flow:
 *   1. Capture a screenshot to the clipboard and read it back as base64 PNG.
 *   2. Send the screenshot plus the description to the OpenAI Responses API
 *      using the computer-use model.
 *   3. For the first "click" action in the response, move the mouse to the
 *      target, pause briefly, then left-click.
 *
 * Inputs:
 *   - keychain item BTT_OPENAI_API_KEY (required)
 *   - keychain item BTT_OPENAI_ORG (optional, sent as Openai-Organization)
 *   - BTT variables focused_screen_width / focused_screen_height
 *   - BTT variable CLICK_ON_ELEMENT (falls back to "the Apple logo")
 *
 * Result (passed to returnToBTT): the model's click action as JSON on
 * success, otherwise `{ success: false, error }`.
 *
 * Side effect: the screenshot replaces the current clipboard contents.
 */
(async () => {
  // Widest display size reported to the model. Wider screens are described
  // at this width and the returned coordinates are scaled back up.
  const MAX_WIDTH = 1728;
  const RESPONSES_URL = "https://api.openai.com/v1/responses";
  // Seconds the animated mouse move takes; the click waits for it to finish.
  const MOUSE_MOVE_DURATION = 0.2;

  // Report a failure to BTT in the script's usual result shape.
  const reportFailure = (message) =>
    returnToBTT(JSON.stringify({ success: false, error: message }));

  // Run a single BTT action definition and wait for BTT's reply.
  const runAction = (definition) =>
    trigger_action({
      json: JSON.stringify(definition),
      wait_for_reply: true,
    });

  try {
    // Everything, including setup, runs inside the try so that any failure
    // is reported through returnToBTT instead of rejecting unhandled.
    const apiKey = await get_keychain_item("BTT_OPENAI_API_KEY");
    if (!apiKey) {
      throw new Error(
        'Missing OpenAI API key (keychain item "BTT_OPENAI_API_KEY")'
      );
    }

    const screenWidth = Number(
      await get_number_variable({ variable_name: "focused_screen_width" })
    );
    const screenHeight = Number(
      await get_number_variable({ variable_name: "focused_screen_height" })
    );
    if (!(screenWidth > 0) || !(screenHeight > 0)) {
      throw new Error(`Invalid screen size: ${screenWidth}x${screenHeight}`);
    }

    await runShellScript({ script: "screencapture -cC" });
    const screenshotBase64 = await get_clipboard_content({
      format: "public.png",
      asBase64: true,
    });
    if (!screenshotBase64) {
      throw new Error("Could not read a screenshot from the clipboard");
    }

    // Get the prompt from BTT or use a default.
    const prompt =
      (await get_string_variable({ variable_name: "CLICK_ON_ELEMENT" })) ||
      "the Apple logo";

    // Display size reported to the model, capped at MAX_WIDTH with the
    // aspect ratio preserved.
    let targetWidth = screenWidth;
    let targetHeight = screenHeight;
    if (screenWidth > MAX_WIDTH) {
      targetWidth = MAX_WIDTH;
      targetHeight = Math.max(
        1,
        Math.round(screenHeight * (MAX_WIDTH / screenWidth))
      );
    }
    // Factors that map the model's coordinates back to screen coordinates
    // (both are 1 when the screen was not scaled down).
    const scaleX = screenWidth / targetWidth;
    const scaleY = screenHeight / targetHeight;

    const headers = {
      Authorization: `Bearer ${apiKey}`,
      "Content-Type": "application/json",
    };
    const organization = await get_keychain_item("BTT_OPENAI_ORG");
    if (organization) {
      headers["Openai-Organization"] = organization;
    }

    const payload = {
      model: "computer-use-preview",
      input: [
        {
          role: "user",
          content: [
            { type: "input_text", text: `Find and click on: ${prompt}` },
            {
              type: "input_image",
              image_url: `data:image/png;base64,${screenshotBase64}`,
            },
          ],
        },
      ],
      tools: [
        {
          // Tool type as documented in OpenAI's computer-use guide.
          type: "computer_use_preview",
          display_width: targetWidth,
          display_height: targetHeight,
          environment: "mac",
        },
      ],
      truncation: "auto",
    };

    const response = await fetch(RESPONSES_URL, {
      method: "POST",
      headers,
      body: JSON.stringify(payload),
    });
    if (!response.ok) {
      throw new Error(
        `API Error: ${response.status} ${await response.text()}`
      );
    }
    const responseData = await response.json();

    // Extract the click coordinates from the response and perform the click.
    try {
      const clickAction = (responseData.output || [])
        .filter((item) => item.type === "computer_call")
        .map((item) => item.action || {})
        .find((action) => action.type === "click");

      if (!clickAction) {
        console.log(`No click action found in response for: ${prompt}`);
        reportFailure(`Could not identify ${prompt}`);
        return;
      }
      if (!Number.isFinite(clickAction.x) || !Number.isFinite(clickAction.y)) {
        throw new Error("click action has no numeric x/y coordinates");
      }

      await runAction({
        BTTActionCategory: 0,
        BTTIsPureAction: true,
        BTTPredefinedActionType: 153,
        BTTPredefinedActionName: "Move Mouse To Position",
        BTTAdditionalActionData: {
          // Scale from the model's display size back to the real screen.
          BTTMouseMoveX: Math.round(clickAction.x * scaleX),
          BTTMouseMoveAnchor: 0,
          // Y is negated, as in the original script, for this anchor.
          BTTMouseMoveY: -Math.round(clickAction.y * scaleY),
          BTTMouseMoveDuration: MOUSE_MOVE_DURATION,
        },
      });

      // Give the animated move time to finish before clicking.
      await runAction({
        BTTActionCategory: 0,
        BTTIsPureAction: true,
        BTTPredefinedActionType: 345,
        BTTPredefinedActionName:
          "Pause Execution or Delay Next Action (async or not blocking)",
        BTTDelayNextActionBy: String(MOUSE_MOVE_DURATION + 0.5),
      });

      await runAction({
        BTTActionCategory: 0,
        BTTIsPureAction: true,
        BTTPredefinedActionType: 3,
        BTTPredefinedActionName: "Left Click (At Current Mouse Position)",
      });

      // Return the model's click action (in the model's coordinate space).
      returnToBTT(JSON.stringify(clickAction));
    } catch (e) {
      console.error(`Error extracting coordinates: ${e.message}`);
      reportFailure(`Error extracting coordinates: ${e.message}`);
    }
  } catch (error) {
    reportFailure(error instanceof Error ? error.message : String(error));
  }
})();
Here's the BTT JSON config:
[
{
"BTTActionCategory" : 0,
"BTTLastUpdatedAt" : 1741948279.2131031,
"BTTTriggerType" : 0,
"BTTTriggerClass" : "BTTTriggerTypeKeyboardShortcut",
"BTTUUID" : "FAC970B7-47DE-4F73-85F9-6002E9EEBD77",
"BTTPredefinedActionType" : 366,
"BTTPredefinedActionName" : "Empty Placeholder",
"BTTAdditionalConfiguration" : "8388608",
"BTTKeyboardShortcutKeyboardType" : 2302,
"BTTTriggerOnDown" : 1,
"BTTLayoutIndependentChar" : "F4",
"BTTEnabled" : 1,
"BTTEnabled2" : 1,
"BTTShortcutKeyCode" : 118,
"BTTShortcutModifierKeys" : 8388608,
"BTTOrder" : 10,
"BTTAutoAdaptToKeyboardLayout" : 0,
"BTTAdditionalActions" : [
{
"BTTActionCategory" : 0,
"BTTLastUpdatedAt" : 1741948188.1062689,
"BTTTriggerParentUUID" : "FAC970B7-47DE-4F73-85F9-6002E9EEBD77",
"BTTIsPureAction" : true,
"BTTTriggerClass" : "BTTTriggerTypeKeyboardShortcut",
"BTTUUID" : "41899D96-6ACA-45B5-B8ED-06CE2A0B2D95",
"BTTPredefinedActionType" : 403,
"BTTPredefinedActionName" : "Ask For Input (Save To Variable)",
"BTTAdditionalActionData" : {
"BTTActionAskForInputOnlyHideOnEnterOrEsc" : 1,
"BTTActionAskForInputVariableName" : "CLICK_ON_ELEMENT",
"BTTActionAskForInputPrompt" : "What do you want to click on?"
},
"BTTKeyboardShortcutKeyboardType" : 0,
"BTTEnabled" : 1,
"BTTEnabled2" : 1,
"BTTShortcutKeyCode" : -1,
"BTTOrder" : 1,
"BTTAutoAdaptToKeyboardLayout" : 0
},
{
"BTTActionCategory" : 0,
"BTTLastUpdatedAt" : 1741949678.9966969,
"BTTTriggerParentUUID" : "FAC970B7-47DE-4F73-85F9-6002E9EEBD77",
"BTTIsPureAction" : true,
"BTTTriggerClass" : "BTTTriggerTypeKeyboardShortcut",
"BTTUUID" : "9778EB56-8A76-4D86-A071-62F3C6D4B1C8",
"BTTPredefinedActionType" : 281,
"BTTPredefinedActionName" : "Run Real JavaScript",
"BTTAdditionalActionData" : {
"BTTScriptFunctionToCall" : "someJavaScriptFunction",
"BTTJavaScriptUseIsolatedContext" : false,
"BTTAppleScriptRunInBackground" : false,
"BTTScriptType" : 3,
"BTTAppleScriptString" : "(async () => {\n\tlet OPENAI_API_KEY = await get_keychain_item(\"BTT_OPENAI_API_KEY\");\n\n\tlet screenWidth = await get_number_variable({\n\t\tvariable_name: \"focused_screen_width\",\n\t});\n\n\tlet screenHeight = await get_number_variable({\n\t\tvariable_name: \"focused_screen_height\",\n\t});\n\n\tlet screenShotScript = `screencapture -cC`;\n\n\tawait runShellScript({ script: screenShotScript });\n\n\tlet clipboardBase64 = await get_clipboard_content({\n\t\tformat: \"public.png\",\n\t\tasBase64: true,\n\t});\n\n\t\/\/ Get the prompt from BTT or use a default\n\tlet prompt =\n\t\t(await get_string_variable({\n\t\t\tvariable_name: \"CLICK_ON_ELEMENT\",\n\t\t})) || \"the Apple logo\";\n\n\t\/\/ Calculate target dimensions (similar to Python implementation)\n\tconst MAX_WIDTH = 1728;\n\tlet targetWidth = screenWidth;\n\tlet targetHeight = screenHeight;\n\n\tif (screenWidth > MAX_WIDTH) {\n\t\tconst scaleFactor = MAX_WIDTH \/ screenWidth;\n\t\ttargetWidth = MAX_WIDTH;\n\t\ttargetHeight = Math.round(screenHeight * scaleFactor);\n\t}\n\n\t\/\/ Prepare the API call\n\tconst url = \"https:\/\/api.openai.com\/v1\/responses\";\n\n\tconst headers = {\n\t\tAuthorization: `Bearer ${OPENAI_API_KEY}`,\n\t\t\"Content-Type\": \"application\/json\",\n\t};\n\n\t\/\/ Check for organization ID\n\tconst OPENAI_ORG = await get_keychain_item(\"BTT_OPENAI_ORG\");\n\tif (OPENAI_ORG) {\n\t\theaders[\"Openai-Organization\"] = OPENAI_ORG;\n\t}\n\n\t\/\/ Prepare the input with screenshot and prompt\n\tconst inputItems = [\n\t\t{\n\t\t\trole: \"user\",\n\t\t\tcontent: [\n\t\t\t\t{ type: \"input_text\", text: `Find and click on: ${prompt}` },\n\t\t\t\t{\n\t\t\t\t\ttype: \"input_image\",\n\t\t\t\t\timage_url: `data:image\/png;base64,${clipboardBase64}`,\n\t\t\t\t},\n\t\t\t],\n\t\t},\n\t];\n\n\t\/\/ Tools configuration\n\tconst tools = [\n\t\t{\n\t\t\ttype: \"computer-preview\",\n\t\t\tdisplay_width: targetWidth,\n\t\t\tdisplay_height: targetHeight,\n\t\t\tenvironment: 
\"mac\",\n\t\t},\n\t];\n\n\t\/\/ Prepare the payload\n\tconst payload = {\n\t\tmodel: \"computer-use-preview\",\n\t\tinput: inputItems,\n\t\ttools: tools,\n\t\ttruncation: \"auto\",\n\t};\n\n\ttry {\n\t\tconst response = await fetch(url, {\n\t\t\tmethod: \"POST\",\n\t\t\theaders: headers,\n\t\t\tbody: JSON.stringify(payload),\n\t\t});\n\n\t\tif (!response.ok) {\n\t\t\tthrow new Error(\n\t\t\t\t`API Error: ${response.status} ${await response.text()}`\n\t\t\t);\n\t\t}\n\n\t\tconst responseData = await response.json();\n\n\t\t\/\/ Extract click coordinates from the response\n\t\ttry {\n\t\t\tconst output = responseData.output || [];\n\t\t\tfor (const item of output) {\n\t\t\t\tif (item.type === \"computer_call\") {\n\t\t\t\t\tconst action = item.action || {};\n\t\t\t\t\tif (action.type === \"click\") {\n\t\t\t\t\t\tlet waitMouseMoveDuration = 0.2;\n\t\t\t\t\t\tlet moveMouseActionDefinition = {\n\t\t\t\t\t\t\tBTTActionCategory: 0,\n\t\t\t\t\t\t\tBTTIsPureAction: true,\n\t\t\t\t\t\t\tBTTPredefinedActionType: 153,\n\t\t\t\t\t\t\tBTTPredefinedActionName: \"Move Mouse To Position\",\n\t\t\t\t\t\t\tBTTAdditionalActionData: {\n\t\t\t\t\t\t\t\tBTTMouseMoveX: action.x,\n\t\t\t\t\t\t\t\tBTTMouseMoveAnchor: 0,\n\t\t\t\t\t\t\t\tBTTMouseMoveY: -action.y,\n\t\t\t\t\t\t\t\tBTTMouseMoveDuration: waitMouseMoveDuration,\n\t\t\t\t\t\t\t},\n\t\t\t\t\t\t};\n\n\t\t\t\t\t\tawait trigger_action({\n\t\t\t\t\t\t\tjson: JSON.stringify(moveMouseActionDefinition),\n\t\t\t\t\t\t\twait_for_reply: true,\n\t\t\t\t\t\t});\n\n\t\t\t\t\t\tlet waitActionDefinition = {\n\t\t\t\t\t\t\tBTTActionCategory: 0,\n\t\t\t\t\t\t\tBTTIsPureAction: true,\n\t\t\t\t\t\t\tBTTPredefinedActionType: 345,\n\t\t\t\t\t\t\tBTTPredefinedActionName:\n\t\t\t\t\t\t\t\t\"Pause Execution or Delay Next Action (async or not blocking)\",\n\t\t\t\t\t\t\tBTTDelayNextActionBy: String(\n\t\t\t\t\t\t\t\twaitMouseMoveDuration + 0.5\n\t\t\t\t\t\t\t),\n\t\t\t\t\t\t};\n\n\t\t\t\t\t\tawait trigger_action({\n\t\t\t\t\t\t\tjson: 
JSON.stringify(waitActionDefinition),\n\t\t\t\t\t\t\twait_for_reply: true,\n\t\t\t\t\t\t});\n\n\t\t\t\t\t\tlet clickActionDefinition = {\n\t\t\t\t\t\t\tBTTActionCategory: 0,\n\t\t\t\t\t\t\tBTTIsPureAction: true,\n\t\t\t\t\t\t\tBTTPredefinedActionType: 3,\n\t\t\t\t\t\t\tBTTPredefinedActionName:\n\t\t\t\t\t\t\t\t\"Left Click (At Current Mouse Position)\",\n\t\t\t\t\t\t};\n\n\t\t\t\t\t\tawait trigger_action({\n\t\t\t\t\t\t\tjson: JSON.stringify(clickActionDefinition),\n\t\t\t\t\t\t\twait_for_reply: true,\n\t\t\t\t\t\t});\n\n\t\t\t\t\t\t\/\/ Return the coordinates if found\n\t\t\t\t\t\treturnToBTT(JSON.stringify(action));\n\t\t\t\t\t\treturn;\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t\t\/\/ If we reach here, no click action was found\n\t\t\tconsole.log(`No click action found in response for: ${prompt}`);\n\t\t\treturnToBTT(\n\t\t\t\tJSON.stringify({\n\t\t\t\t\tsuccess: false,\n\t\t\t\t\terror: `Could not identify ${prompt}`,\n\t\t\t\t})\n\t\t\t);\n\t\t} catch (e) {\n\t\t\tconsole.error(`Error extracting coordinates: ${e.message}`);\n\t\t\treturnToBTT(\n\t\t\t\tJSON.stringify({\n\t\t\t\t\tsuccess: false,\n\t\t\t\t\terror: `Error extracting coordinates: ${e.message}`,\n\t\t\t\t})\n\t\t\t);\n\t\t}\n\t} catch (error) {\n\t\treturnToBTT(\n\t\t\tJSON.stringify({\n\t\t\t\tsuccess: false,\n\t\t\t\terror: error.message,\n\t\t\t})\n\t\t);\n\t}\n})();\n",
"BTTActionJSRunInSeparateContext" : false,
"BTTAppleScriptUsePath" : false,
"BTTScriptLocation" : 0
},
"BTTRealJavaScriptString" : "(async () => {\n\tlet OPENAI_API_KEY = await get_keychain_item(\"BTT_OPENAI_API_KEY\");\n\n\tlet screenWidth = await get_number_variable({\n\t\tvariable_name: \"focused_screen_width\",\n\t});\n\n\tlet screenHeight = await get_number_variable({\n\t\tvariable_name: \"focused_screen_height\",\n\t});\n\n\tlet screenShotScript = `screencapture -cC`;\n\n\tawait runShellScript({ script: screenShotScript });\n\n\tlet clipboardBase64 = await get_clipboard_content({\n\t\tformat: \"public.png\",\n\t\tasBase64: true,\n\t});\n\n\t\/\/ Get the prompt from BTT or use a default\n\tlet prompt =\n\t\t(await get_string_variable({\n\t\t\tvariable_name: \"CLICK_ON_ELEMENT\",\n\t\t})) || \"the Apple logo\";\n\n\t\/\/ Calculate target dimensions (similar to Python implementation)\n\tconst MAX_WIDTH = 1728;\n\tlet targetWidth = screenWidth;\n\tlet targetHeight = screenHeight;\n\n\tif (screenWidth > MAX_WIDTH) {\n\t\tconst scaleFactor = MAX_WIDTH \/ screenWidth;\n\t\ttargetWidth = MAX_WIDTH;\n\t\ttargetHeight = Math.round(screenHeight * scaleFactor);\n\t}\n\n\t\/\/ Prepare the API call\n\tconst url = \"https:\/\/api.openai.com\/v1\/responses\";\n\n\tconst headers = {\n\t\tAuthorization: `Bearer ${OPENAI_API_KEY}`,\n\t\t\"Content-Type\": \"application\/json\",\n\t};\n\n\t\/\/ Check for organization ID\n\tconst OPENAI_ORG = await get_keychain_item(\"BTT_OPENAI_ORG\");\n\tif (OPENAI_ORG) {\n\t\theaders[\"Openai-Organization\"] = OPENAI_ORG;\n\t}\n\n\t\/\/ Prepare the input with screenshot and prompt\n\tconst inputItems = [\n\t\t{\n\t\t\trole: \"user\",\n\t\t\tcontent: [\n\t\t\t\t{ type: \"input_text\", text: `Find and click on: ${prompt}` },\n\t\t\t\t{\n\t\t\t\t\ttype: \"input_image\",\n\t\t\t\t\timage_url: `data:image\/png;base64,${clipboardBase64}`,\n\t\t\t\t},\n\t\t\t],\n\t\t},\n\t];\n\n\t\/\/ Tools configuration\n\tconst tools = [\n\t\t{\n\t\t\ttype: \"computer-preview\",\n\t\t\tdisplay_width: targetWidth,\n\t\t\tdisplay_height: 
targetHeight,\n\t\t\tenvironment: \"mac\",\n\t\t},\n\t];\n\n\t\/\/ Prepare the payload\n\tconst payload = {\n\t\tmodel: \"computer-use-preview\",\n\t\tinput: inputItems,\n\t\ttools: tools,\n\t\ttruncation: \"auto\",\n\t};\n\n\ttry {\n\t\tconst response = await fetch(url, {\n\t\t\tmethod: \"POST\",\n\t\t\theaders: headers,\n\t\t\tbody: JSON.stringify(payload),\n\t\t});\n\n\t\tif (!response.ok) {\n\t\t\tthrow new Error(\n\t\t\t\t`API Error: ${response.status} ${await response.text()}`\n\t\t\t);\n\t\t}\n\n\t\tconst responseData = await response.json();\n\n\t\t\/\/ Extract click coordinates from the response\n\t\ttry {\n\t\t\tconst output = responseData.output || [];\n\t\t\tfor (const item of output) {\n\t\t\t\tif (item.type === \"computer_call\") {\n\t\t\t\t\tconst action = item.action || {};\n\t\t\t\t\tif (action.type === \"click\") {\n\t\t\t\t\t\tlet waitMouseMoveDuration = 0.2;\n\t\t\t\t\t\tlet moveMouseActionDefinition = {\n\t\t\t\t\t\t\tBTTActionCategory: 0,\n\t\t\t\t\t\t\tBTTIsPureAction: true,\n\t\t\t\t\t\t\tBTTPredefinedActionType: 153,\n\t\t\t\t\t\t\tBTTPredefinedActionName: \"Move Mouse To Position\",\n\t\t\t\t\t\t\tBTTAdditionalActionData: {\n\t\t\t\t\t\t\t\tBTTMouseMoveX: action.x,\n\t\t\t\t\t\t\t\tBTTMouseMoveAnchor: 0,\n\t\t\t\t\t\t\t\tBTTMouseMoveY: -action.y,\n\t\t\t\t\t\t\t\tBTTMouseMoveDuration: waitMouseMoveDuration,\n\t\t\t\t\t\t\t},\n\t\t\t\t\t\t};\n\n\t\t\t\t\t\tawait trigger_action({\n\t\t\t\t\t\t\tjson: JSON.stringify(moveMouseActionDefinition),\n\t\t\t\t\t\t\twait_for_reply: true,\n\t\t\t\t\t\t});\n\n\t\t\t\t\t\tlet waitActionDefinition = {\n\t\t\t\t\t\t\tBTTActionCategory: 0,\n\t\t\t\t\t\t\tBTTIsPureAction: true,\n\t\t\t\t\t\t\tBTTPredefinedActionType: 345,\n\t\t\t\t\t\t\tBTTPredefinedActionName:\n\t\t\t\t\t\t\t\t\"Pause Execution or Delay Next Action (async or not blocking)\",\n\t\t\t\t\t\t\tBTTDelayNextActionBy: String(\n\t\t\t\t\t\t\t\twaitMouseMoveDuration + 0.5\n\t\t\t\t\t\t\t),\n\t\t\t\t\t\t};\n\n\t\t\t\t\t\tawait 
trigger_action({\n\t\t\t\t\t\t\tjson: JSON.stringify(waitActionDefinition),\n\t\t\t\t\t\t\twait_for_reply: true,\n\t\t\t\t\t\t});\n\n\t\t\t\t\t\tlet clickActionDefinition = {\n\t\t\t\t\t\t\tBTTActionCategory: 0,\n\t\t\t\t\t\t\tBTTIsPureAction: true,\n\t\t\t\t\t\t\tBTTPredefinedActionType: 3,\n\t\t\t\t\t\t\tBTTPredefinedActionName:\n\t\t\t\t\t\t\t\t\"Left Click (At Current Mouse Position)\",\n\t\t\t\t\t\t};\n\n\t\t\t\t\t\tawait trigger_action({\n\t\t\t\t\t\t\tjson: JSON.stringify(clickActionDefinition),\n\t\t\t\t\t\t\twait_for_reply: true,\n\t\t\t\t\t\t});\n\n\t\t\t\t\t\t\/\/ Return the coordinates if found\n\t\t\t\t\t\treturnToBTT(JSON.stringify(action));\n\t\t\t\t\t\treturn;\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t\t\/\/ If we reach here, no click action was found\n\t\t\tconsole.log(`No click action found in response for: ${prompt}`);\n\t\t\treturnToBTT(\n\t\t\t\tJSON.stringify({\n\t\t\t\t\tsuccess: false,\n\t\t\t\t\terror: `Could not identify ${prompt}`,\n\t\t\t\t})\n\t\t\t);\n\t\t} catch (e) {\n\t\t\tconsole.error(`Error extracting coordinates: ${e.message}`);\n\t\t\treturnToBTT(\n\t\t\t\tJSON.stringify({\n\t\t\t\t\tsuccess: false,\n\t\t\t\t\terror: `Error extracting coordinates: ${e.message}`,\n\t\t\t\t})\n\t\t\t);\n\t\t}\n\t} catch (error) {\n\t\treturnToBTT(\n\t\t\tJSON.stringify({\n\t\t\t\tsuccess: false,\n\t\t\t\terror: error.message,\n\t\t\t})\n\t\t);\n\t}\n})();\n",
"BTTKeyboardShortcutKeyboardType" : 0,
"BTTEnabled" : 1,
"BTTEnabled2" : 1,
"BTTShortcutKeyCode" : -1,
"BTTOrder" : 2,
"BTTAutoAdaptToKeyboardLayout" : 0
}
]
}
]