Move the mouse to a specific UI element

fortred2 · March 14, 2025, 10:47am

Here's an implementation that uses OpenAI's new computer-use model with their new Responses API – it's pretty cool.

See docs here: https://platform.openai.com/docs/guides/tools-computer-use

I wrote this script quite quickly so I'll add improvements later... but it works!

In this demo, you type a description of the on-screen element you want to click, and your computer then uses vision AI to locate it and click on it automatically.

Here's the script:


/**
 * BetterTouchTool "Run Real JavaScript" script: click the UI element the
 * user described.
 *
 * Flow:
 *   1. Read the OpenAI API key (and optional organization) from the keychain.
 *   2. Screenshot the screen to the clipboard and read it back as base64 PNG.
 *   3. Send the screenshot plus the description stored in the BTT variable
 *      CLICK_ON_ELEMENT to the computer-use model via the Responses API.
 *   4. Take the first "click" action in the response, map its coordinates
 *      back to real screen coordinates, move the mouse there and left-click.
 *
 * Result (via returnToBTT): the model's click action as JSON on success,
 * otherwise {"success": false, "error": "..."}. Every failure path reports
 * back to BTT, including failures that happen before the API call.
 */
(async () => {
	// Widest display width reported to the model; wider screens are reported
	// as a proportionally scaled-down display.
	const MAX_WIDTH = 1728;
	// Seconds BTT takes to animate the mouse move.
	const MOUSE_MOVE_DURATION = 0.2;

	// Reports a failure to BTT in the script's { success, error } shape.
	const reportFailure = (message) =>
		returnToBTT(JSON.stringify({ success: false, error: message }));

	// Runs one predefined BTT action and waits until BTT replies.
	const runAction = (definition) =>
		trigger_action({
			json: JSON.stringify(definition),
			wait_for_reply: true,
		});

	// True for finite numbers greater than zero.
	const isPositiveNumber = (value) => Number.isFinite(value) && value > 0;

	try {
		const OPENAI_API_KEY = await get_keychain_item("BTT_OPENAI_API_KEY");
		if (!OPENAI_API_KEY) {
			// Fail before taking a screenshot or sending "Bearer undefined".
			throw new Error('Missing keychain item "BTT_OPENAI_API_KEY"');
		}

		const screenWidth = Number(
			await get_number_variable({
				variable_name: "focused_screen_width",
			})
		);
		const screenHeight = Number(
			await get_number_variable({
				variable_name: "focused_screen_height",
			})
		);
		if (!isPositiveNumber(screenWidth) || !isPositiveNumber(screenHeight)) {
			throw new Error(
				`Invalid screen size: ${screenWidth}x${screenHeight}`
			);
		}

		// screencapture: -c sends the capture to the clipboard, -C includes
		// the mouse cursor. Note that this overwrites the user's clipboard.
		await runShellScript({ script: `screencapture -cC` });

		const clipboardBase64 = await get_clipboard_content({
			format: "public.png",
			asBase64: true,
		});
		if (!clipboardBase64) {
			throw new Error("Could not read the screenshot from the clipboard");
		}

		// Get the prompt from BTT or use a default
		const prompt =
			(await get_string_variable({
				variable_name: "CLICK_ON_ELEMENT",
			})) || "the Apple logo";

		// Display size reported to the model (capped at MAX_WIDTH).
		let targetWidth = screenWidth;
		let targetHeight = screenHeight;
		if (screenWidth > MAX_WIDTH) {
			const scaleFactor = MAX_WIDTH / screenWidth;
			targetWidth = MAX_WIDTH;
			targetHeight = Math.round(screenHeight * scaleFactor);
		}

		// The model answers in the reported (target) coordinate space, so its
		// coordinates must be scaled back up before they are used on screen.
		// Both factors are exactly 1 when no down-scaling happened.
		const scaleX = screenWidth / targetWidth;
		const scaleY = screenHeight / targetHeight;

		const headers = {
			Authorization: `Bearer ${OPENAI_API_KEY}`,
			"Content-Type": "application/json",
		};

		// The organization header is optional.
		const OPENAI_ORG = await get_keychain_item("BTT_OPENAI_ORG");
		if (OPENAI_ORG) {
			headers["Openai-Organization"] = OPENAI_ORG;
		}

		const payload = {
			model: "computer-use-preview",
			input: [
				{
					role: "user",
					content: [
						{
							type: "input_text",
							text: `Find and click on: ${prompt}`,
						},
						{
							type: "input_image",
							image_url: `data:image/png;base64,${clipboardBase64}`,
						},
					],
				},
			],
			tools: [
				{
					type: "computer-preview",
					display_width: targetWidth,
					display_height: targetHeight,
					environment: "mac",
				},
			],
			truncation: "auto",
		};

		const response = await fetch("https://api.openai.com/v1/responses", {
			method: "POST",
			headers: headers,
			body: JSON.stringify(payload),
		});

		if (!response.ok) {
			throw new Error(
				`API Error: ${response.status} ${await response.text()}`
			);
		}

		const responseData = await response.json();

		// Extract click coordinates from the response
		try {
			const output = responseData.output || [];
			const clickCall = output.find(
				(item) =>
					item.type === "computer_call" &&
					(item.action || {}).type === "click"
			);

			if (!clickCall) {
				console.log(`No click action found in response for: ${prompt}`);
				reportFailure(`Could not identify ${prompt}`);
				return;
			}

			const action = clickCall.action;
			if (!Number.isFinite(action.x) || !Number.isFinite(action.y)) {
				throw new Error(
					`click action has invalid coordinates (${action.x}, ${action.y})`
				);
			}

			await runAction({
				BTTActionCategory: 0,
				BTTIsPureAction: true,
				BTTPredefinedActionType: 153,
				BTTPredefinedActionName: "Move Mouse To Position",
				BTTAdditionalActionData: {
					BTTMouseMoveX: Math.round(action.x * scaleX),
					BTTMouseMoveAnchor: 0,
					// Y is negated as in the original script; presumably anchor 0
					// measures Y upward from the top edge — TODO confirm in BTT.
					BTTMouseMoveY: -Math.round(action.y * scaleY),
					BTTMouseMoveDuration: MOUSE_MOVE_DURATION,
				},
			});

			// Give the animated move time to finish before clicking.
			await runAction({
				BTTActionCategory: 0,
				BTTIsPureAction: true,
				BTTPredefinedActionType: 345,
				BTTPredefinedActionName:
					"Pause Execution  or  Delay Next Action (async  or  not blocking)",
				BTTDelayNextActionBy: String(MOUSE_MOVE_DURATION + 0.5),
			});

			await runAction({
				BTTActionCategory: 0,
				BTTIsPureAction: true,
				BTTPredefinedActionType: 3,
				BTTPredefinedActionName:
					"Left Click (At Current Mouse Position)",
			});

			// Return the model's click action (in the model's coordinate space)
			returnToBTT(JSON.stringify(action));
		} catch (e) {
			console.error(`Error extracting coordinates: ${e.message}`);
			reportFailure(`Error extracting coordinates: ${e.message}`);
		}
	} catch (error) {
		// Covers setup, network and API failures so BTT always gets a reply.
		reportFailure(error && error.message ? error.message : String(error));
	}
})();

Here's the BTT JSON config:

[
  {
    "BTTActionCategory" : 0,
    "BTTLastUpdatedAt" : 1741948279.2131031,
    "BTTTriggerType" : 0,
    "BTTTriggerClass" : "BTTTriggerTypeKeyboardShortcut",
    "BTTUUID" : "FAC970B7-47DE-4F73-85F9-6002E9EEBD77",
    "BTTPredefinedActionType" : 366,
    "BTTPredefinedActionName" : "Empty Placeholder",
    "BTTAdditionalConfiguration" : "8388608",
    "BTTKeyboardShortcutKeyboardType" : 2302,
    "BTTTriggerOnDown" : 1,
    "BTTLayoutIndependentChar" : "F4",
    "BTTEnabled" : 1,
    "BTTEnabled2" : 1,
    "BTTShortcutKeyCode" : 118,
    "BTTShortcutModifierKeys" : 8388608,
    "BTTOrder" : 10,
    "BTTAutoAdaptToKeyboardLayout" : 0,
    "BTTAdditionalActions" : [
      {
        "BTTActionCategory" : 0,
        "BTTLastUpdatedAt" : 1741948188.1062689,
        "BTTTriggerParentUUID" : "FAC970B7-47DE-4F73-85F9-6002E9EEBD77",
        "BTTIsPureAction" : true,
        "BTTTriggerClass" : "BTTTriggerTypeKeyboardShortcut",
        "BTTUUID" : "41899D96-6ACA-45B5-B8ED-06CE2A0B2D95",
        "BTTPredefinedActionType" : 403,
        "BTTPredefinedActionName" : "Ask For Input (Save To Variable)",
        "BTTAdditionalActionData" : {
          "BTTActionAskForInputOnlyHideOnEnterOrEsc" : 1,
          "BTTActionAskForInputVariableName" : "CLICK_ON_ELEMENT",
          "BTTActionAskForInputPrompt" : "What do you want to click on?"
        },
        "BTTKeyboardShortcutKeyboardType" : 0,
        "BTTEnabled" : 1,
        "BTTEnabled2" : 1,
        "BTTShortcutKeyCode" : -1,
        "BTTOrder" : 1,
        "BTTAutoAdaptToKeyboardLayout" : 0
      },
      {
        "BTTActionCategory" : 0,
        "BTTLastUpdatedAt" : 1741949678.9966969,
        "BTTTriggerParentUUID" : "FAC970B7-47DE-4F73-85F9-6002E9EEBD77",
        "BTTIsPureAction" : true,
        "BTTTriggerClass" : "BTTTriggerTypeKeyboardShortcut",
        "BTTUUID" : "9778EB56-8A76-4D86-A071-62F3C6D4B1C8",
        "BTTPredefinedActionType" : 281,
        "BTTPredefinedActionName" : "Run Real JavaScript",
        "BTTAdditionalActionData" : {
          "BTTScriptFunctionToCall" : "someJavaScriptFunction",
          "BTTJavaScriptUseIsolatedContext" : false,
          "BTTAppleScriptRunInBackground" : false,
          "BTTScriptType" : 3,
          "BTTAppleScriptString" : "(async () => {\n\tlet OPENAI_API_KEY = await get_keychain_item(\"BTT_OPENAI_API_KEY\");\n\n\tlet screenWidth = await get_number_variable({\n\t\tvariable_name: \"focused_screen_width\",\n\t});\n\n\tlet screenHeight = await get_number_variable({\n\t\tvariable_name: \"focused_screen_height\",\n\t});\n\n\tlet screenShotScript = `screencapture -cC`;\n\n\tawait runShellScript({ script: screenShotScript });\n\n\tlet clipboardBase64 = await get_clipboard_content({\n\t\tformat: \"public.png\",\n\t\tasBase64: true,\n\t});\n\n\t\/\/ Get the prompt from BTT or use a default\n\tlet prompt =\n\t\t(await get_string_variable({\n\t\t\tvariable_name: \"CLICK_ON_ELEMENT\",\n\t\t})) || \"the Apple logo\";\n\n\t\/\/ Calculate target dimensions (similar to Python implementation)\n\tconst MAX_WIDTH = 1728;\n\tlet targetWidth = screenWidth;\n\tlet targetHeight = screenHeight;\n\n\tif (screenWidth > MAX_WIDTH) {\n\t\tconst scaleFactor = MAX_WIDTH \/ screenWidth;\n\t\ttargetWidth = MAX_WIDTH;\n\t\ttargetHeight = Math.round(screenHeight * scaleFactor);\n\t}\n\n\t\/\/ Prepare the API call\n\tconst url = \"https:\/\/api.openai.com\/v1\/responses\";\n\n\tconst headers = {\n\t\tAuthorization: `Bearer ${OPENAI_API_KEY}`,\n\t\t\"Content-Type\": \"application\/json\",\n\t};\n\n\t\/\/ Check for organization ID\n\tconst OPENAI_ORG = await get_keychain_item(\"BTT_OPENAI_ORG\");\n\tif (OPENAI_ORG) {\n\t\theaders[\"Openai-Organization\"] = OPENAI_ORG;\n\t}\n\n\t\/\/ Prepare the input with screenshot and prompt\n\tconst inputItems = [\n\t\t{\n\t\t\trole: \"user\",\n\t\t\tcontent: [\n\t\t\t\t{ type: \"input_text\", text: `Find and click on: ${prompt}` },\n\t\t\t\t{\n\t\t\t\t\ttype: \"input_image\",\n\t\t\t\t\timage_url: `data:image\/png;base64,${clipboardBase64}`,\n\t\t\t\t},\n\t\t\t],\n\t\t},\n\t];\n\n\t\/\/ Tools configuration\n\tconst tools = [\n\t\t{\n\t\t\ttype: \"computer-preview\",\n\t\t\tdisplay_width: targetWidth,\n\t\t\tdisplay_height: 
targetHeight,\n\t\t\tenvironment: \"mac\",\n\t\t},\n\t];\n\n\t\/\/ Prepare the payload\n\tconst payload = {\n\t\tmodel: \"computer-use-preview\",\n\t\tinput: inputItems,\n\t\ttools: tools,\n\t\ttruncation: \"auto\",\n\t};\n\n\ttry {\n\t\tconst response = await fetch(url, {\n\t\t\tmethod: \"POST\",\n\t\t\theaders: headers,\n\t\t\tbody: JSON.stringify(payload),\n\t\t});\n\n\t\tif (!response.ok) {\n\t\t\tthrow new Error(\n\t\t\t\t`API Error: ${response.status} ${await response.text()}`\n\t\t\t);\n\t\t}\n\n\t\tconst responseData = await response.json();\n\n\t\t\/\/ Extract click coordinates from the response\n\t\ttry {\n\t\t\tconst output = responseData.output || [];\n\t\t\tfor (const item of output) {\n\t\t\t\tif (item.type === \"computer_call\") {\n\t\t\t\t\tconst action = item.action || {};\n\t\t\t\t\tif (action.type === \"click\") {\n\t\t\t\t\t\tlet waitMouseMoveDuration = 0.2;\n\t\t\t\t\t\tlet moveMouseActionDefinition = {\n\t\t\t\t\t\t\tBTTActionCategory: 0,\n\t\t\t\t\t\t\tBTTIsPureAction: true,\n\t\t\t\t\t\t\tBTTPredefinedActionType: 153,\n\t\t\t\t\t\t\tBTTPredefinedActionName: \"Move Mouse To Position\",\n\t\t\t\t\t\t\tBTTAdditionalActionData: {\n\t\t\t\t\t\t\t\tBTTMouseMoveX: action.x,\n\t\t\t\t\t\t\t\tBTTMouseMoveAnchor: 0,\n\t\t\t\t\t\t\t\tBTTMouseMoveY: -action.y,\n\t\t\t\t\t\t\t\tBTTMouseMoveDuration: waitMouseMoveDuration,\n\t\t\t\t\t\t\t},\n\t\t\t\t\t\t};\n\n\t\t\t\t\t\tawait trigger_action({\n\t\t\t\t\t\t\tjson: JSON.stringify(moveMouseActionDefinition),\n\t\t\t\t\t\t\twait_for_reply: true,\n\t\t\t\t\t\t});\n\n\t\t\t\t\t\tlet waitActionDefinition = {\n\t\t\t\t\t\t\tBTTActionCategory: 0,\n\t\t\t\t\t\t\tBTTIsPureAction: true,\n\t\t\t\t\t\t\tBTTPredefinedActionType: 345,\n\t\t\t\t\t\t\tBTTPredefinedActionName:\n\t\t\t\t\t\t\t\t\"Pause Execution  or  Delay Next Action (async  or  not blocking)\",\n\t\t\t\t\t\t\tBTTDelayNextActionBy: String(\n\t\t\t\t\t\t\t\twaitMouseMoveDuration + 0.5\n\t\t\t\t\t\t\t),\n\t\t\t\t\t\t};\n\n\t\t\t\t\t\tawait 
trigger_action({\n\t\t\t\t\t\t\tjson: JSON.stringify(waitActionDefinition),\n\t\t\t\t\t\t\twait_for_reply: true,\n\t\t\t\t\t\t});\n\n\t\t\t\t\t\tlet clickActionDefinition = {\n\t\t\t\t\t\t\tBTTActionCategory: 0,\n\t\t\t\t\t\t\tBTTIsPureAction: true,\n\t\t\t\t\t\t\tBTTPredefinedActionType: 3,\n\t\t\t\t\t\t\tBTTPredefinedActionName:\n\t\t\t\t\t\t\t\t\"Left Click (At Current Mouse Position)\",\n\t\t\t\t\t\t};\n\n\t\t\t\t\t\tawait trigger_action({\n\t\t\t\t\t\t\tjson: JSON.stringify(clickActionDefinition),\n\t\t\t\t\t\t\twait_for_reply: true,\n\t\t\t\t\t\t});\n\n\t\t\t\t\t\t\/\/ Return the coordinates if found\n\t\t\t\t\t\treturnToBTT(JSON.stringify(action));\n\t\t\t\t\t\treturn;\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t\t\/\/ If we reach here, no click action was found\n\t\t\tconsole.log(`No click action found in response for: ${prompt}`);\n\t\t\treturnToBTT(\n\t\t\t\tJSON.stringify({\n\t\t\t\t\tsuccess: false,\n\t\t\t\t\terror: `Could not identify ${prompt}`,\n\t\t\t\t})\n\t\t\t);\n\t\t} catch (e) {\n\t\t\tconsole.error(`Error extracting coordinates: ${e.message}`);\n\t\t\treturnToBTT(\n\t\t\t\tJSON.stringify({\n\t\t\t\t\tsuccess: false,\n\t\t\t\t\terror: `Error extracting coordinates: ${e.message}`,\n\t\t\t\t})\n\t\t\t);\n\t\t}\n\t} catch (error) {\n\t\treturnToBTT(\n\t\t\tJSON.stringify({\n\t\t\t\tsuccess: false,\n\t\t\t\terror: error.message,\n\t\t\t})\n\t\t);\n\t}\n})();\n",
          "BTTActionJSRunInSeparateContext" : false,
          "BTTAppleScriptUsePath" : false,
          "BTTScriptLocation" : 0
        },
        "BTTRealJavaScriptString" : "(async () => {\n\tlet OPENAI_API_KEY = await get_keychain_item(\"BTT_OPENAI_API_KEY\");\n\n\tlet screenWidth = await get_number_variable({\n\t\tvariable_name: \"focused_screen_width\",\n\t});\n\n\tlet screenHeight = await get_number_variable({\n\t\tvariable_name: \"focused_screen_height\",\n\t});\n\n\tlet screenShotScript = `screencapture -cC`;\n\n\tawait runShellScript({ script: screenShotScript });\n\n\tlet clipboardBase64 = await get_clipboard_content({\n\t\tformat: \"public.png\",\n\t\tasBase64: true,\n\t});\n\n\t\/\/ Get the prompt from BTT or use a default\n\tlet prompt =\n\t\t(await get_string_variable({\n\t\t\tvariable_name: \"CLICK_ON_ELEMENT\",\n\t\t})) || \"the Apple logo\";\n\n\t\/\/ Calculate target dimensions (similar to Python implementation)\n\tconst MAX_WIDTH = 1728;\n\tlet targetWidth = screenWidth;\n\tlet targetHeight = screenHeight;\n\n\tif (screenWidth > MAX_WIDTH) {\n\t\tconst scaleFactor = MAX_WIDTH \/ screenWidth;\n\t\ttargetWidth = MAX_WIDTH;\n\t\ttargetHeight = Math.round(screenHeight * scaleFactor);\n\t}\n\n\t\/\/ Prepare the API call\n\tconst url = \"https:\/\/api.openai.com\/v1\/responses\";\n\n\tconst headers = {\n\t\tAuthorization: `Bearer ${OPENAI_API_KEY}`,\n\t\t\"Content-Type\": \"application\/json\",\n\t};\n\n\t\/\/ Check for organization ID\n\tconst OPENAI_ORG = await get_keychain_item(\"BTT_OPENAI_ORG\");\n\tif (OPENAI_ORG) {\n\t\theaders[\"Openai-Organization\"] = OPENAI_ORG;\n\t}\n\n\t\/\/ Prepare the input with screenshot and prompt\n\tconst inputItems = [\n\t\t{\n\t\t\trole: \"user\",\n\t\t\tcontent: [\n\t\t\t\t{ type: \"input_text\", text: `Find and click on: ${prompt}` },\n\t\t\t\t{\n\t\t\t\t\ttype: \"input_image\",\n\t\t\t\t\timage_url: `data:image\/png;base64,${clipboardBase64}`,\n\t\t\t\t},\n\t\t\t],\n\t\t},\n\t];\n\n\t\/\/ Tools configuration\n\tconst tools = [\n\t\t{\n\t\t\ttype: \"computer-preview\",\n\t\t\tdisplay_width: targetWidth,\n\t\t\tdisplay_height: 
targetHeight,\n\t\t\tenvironment: \"mac\",\n\t\t},\n\t];\n\n\t\/\/ Prepare the payload\n\tconst payload = {\n\t\tmodel: \"computer-use-preview\",\n\t\tinput: inputItems,\n\t\ttools: tools,\n\t\ttruncation: \"auto\",\n\t};\n\n\ttry {\n\t\tconst response = await fetch(url, {\n\t\t\tmethod: \"POST\",\n\t\t\theaders: headers,\n\t\t\tbody: JSON.stringify(payload),\n\t\t});\n\n\t\tif (!response.ok) {\n\t\t\tthrow new Error(\n\t\t\t\t`API Error: ${response.status} ${await response.text()}`\n\t\t\t);\n\t\t}\n\n\t\tconst responseData = await response.json();\n\n\t\t\/\/ Extract click coordinates from the response\n\t\ttry {\n\t\t\tconst output = responseData.output || [];\n\t\t\tfor (const item of output) {\n\t\t\t\tif (item.type === \"computer_call\") {\n\t\t\t\t\tconst action = item.action || {};\n\t\t\t\t\tif (action.type === \"click\") {\n\t\t\t\t\t\tlet waitMouseMoveDuration = 0.2;\n\t\t\t\t\t\tlet moveMouseActionDefinition = {\n\t\t\t\t\t\t\tBTTActionCategory: 0,\n\t\t\t\t\t\t\tBTTIsPureAction: true,\n\t\t\t\t\t\t\tBTTPredefinedActionType: 153,\n\t\t\t\t\t\t\tBTTPredefinedActionName: \"Move Mouse To Position\",\n\t\t\t\t\t\t\tBTTAdditionalActionData: {\n\t\t\t\t\t\t\t\tBTTMouseMoveX: action.x,\n\t\t\t\t\t\t\t\tBTTMouseMoveAnchor: 0,\n\t\t\t\t\t\t\t\tBTTMouseMoveY: -action.y,\n\t\t\t\t\t\t\t\tBTTMouseMoveDuration: waitMouseMoveDuration,\n\t\t\t\t\t\t\t},\n\t\t\t\t\t\t};\n\n\t\t\t\t\t\tawait trigger_action({\n\t\t\t\t\t\t\tjson: JSON.stringify(moveMouseActionDefinition),\n\t\t\t\t\t\t\twait_for_reply: true,\n\t\t\t\t\t\t});\n\n\t\t\t\t\t\tlet waitActionDefinition = {\n\t\t\t\t\t\t\tBTTActionCategory: 0,\n\t\t\t\t\t\t\tBTTIsPureAction: true,\n\t\t\t\t\t\t\tBTTPredefinedActionType: 345,\n\t\t\t\t\t\t\tBTTPredefinedActionName:\n\t\t\t\t\t\t\t\t\"Pause Execution  or  Delay Next Action (async  or  not blocking)\",\n\t\t\t\t\t\t\tBTTDelayNextActionBy: String(\n\t\t\t\t\t\t\t\twaitMouseMoveDuration + 0.5\n\t\t\t\t\t\t\t),\n\t\t\t\t\t\t};\n\n\t\t\t\t\t\tawait 
trigger_action({\n\t\t\t\t\t\t\tjson: JSON.stringify(waitActionDefinition),\n\t\t\t\t\t\t\twait_for_reply: true,\n\t\t\t\t\t\t});\n\n\t\t\t\t\t\tlet clickActionDefinition = {\n\t\t\t\t\t\t\tBTTActionCategory: 0,\n\t\t\t\t\t\t\tBTTIsPureAction: true,\n\t\t\t\t\t\t\tBTTPredefinedActionType: 3,\n\t\t\t\t\t\t\tBTTPredefinedActionName:\n\t\t\t\t\t\t\t\t\"Left Click (At Current Mouse Position)\",\n\t\t\t\t\t\t};\n\n\t\t\t\t\t\tawait trigger_action({\n\t\t\t\t\t\t\tjson: JSON.stringify(clickActionDefinition),\n\t\t\t\t\t\t\twait_for_reply: true,\n\t\t\t\t\t\t});\n\n\t\t\t\t\t\t\/\/ Return the coordinates if found\n\t\t\t\t\t\treturnToBTT(JSON.stringify(action));\n\t\t\t\t\t\treturn;\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t\t\/\/ If we reach here, no click action was found\n\t\t\tconsole.log(`No click action found in response for: ${prompt}`);\n\t\t\treturnToBTT(\n\t\t\t\tJSON.stringify({\n\t\t\t\t\tsuccess: false,\n\t\t\t\t\terror: `Could not identify ${prompt}`,\n\t\t\t\t})\n\t\t\t);\n\t\t} catch (e) {\n\t\t\tconsole.error(`Error extracting coordinates: ${e.message}`);\n\t\t\treturnToBTT(\n\t\t\t\tJSON.stringify({\n\t\t\t\t\tsuccess: false,\n\t\t\t\t\terror: `Error extracting coordinates: ${e.message}`,\n\t\t\t\t})\n\t\t\t);\n\t\t}\n\t} catch (error) {\n\t\treturnToBTT(\n\t\t\tJSON.stringify({\n\t\t\t\tsuccess: false,\n\t\t\t\terror: error.message,\n\t\t\t})\n\t\t);\n\t}\n})();\n",
        "BTTKeyboardShortcutKeyboardType" : 0,
        "BTTEnabled" : 1,
        "BTTEnabled2" : 1,
        "BTTShortcutKeyCode" : -1,
        "BTTOrder" : 2,
        "BTTAutoAdaptToKeyboardLayout" : 0
      }
    ]
  }
]