diff --git a/tests/test_agent.py b/tests/test_agent.py index 6520231..e028ba6 100644 --- a/tests/test_agent.py +++ b/tests/test_agent.py @@ -1,7 +1,8 @@ """ Integration tests for learn-claude-code agents. -Real agent loop tests that run on GitHub Actions (Linux). +Comprehensive agent task tests covering v0-v4 core capabilities. +Runs on GitHub Actions (Linux). """ import os import sys @@ -24,6 +25,11 @@ def get_client(): MODEL = os.getenv("TEST_MODEL", "claude-3-5-sonnet-20241022") + +# ============================================================================= +# Tool Definitions +# ============================================================================= + BASH_TOOL = { "type": "function", "function": { @@ -37,37 +43,176 @@ BASH_TOOL = { } } +READ_FILE_TOOL = { + "type": "function", + "function": { + "name": "read_file", + "description": "Read contents of a file", + "parameters": { + "type": "object", + "properties": {"path": {"type": "string"}}, + "required": ["path"] + } + } +} -def run_agent_loop(client, task, tools, max_turns=10): - """ - Run a complete agent loop until done or max_turns. - Returns (final_response, tool_calls_made) - """ +WRITE_FILE_TOOL = { + "type": "function", + "function": { + "name": "write_file", + "description": "Write content to a file (creates or overwrites)", + "parameters": { + "type": "object", + "properties": { + "path": {"type": "string"}, + "content": {"type": "string"} + }, + "required": ["path", "content"] + } + } +} + +EDIT_FILE_TOOL = { + "type": "function", + "function": { + "name": "edit_file", + "description": "Replace old_string with new_string in a file", + "parameters": { + "type": "object", + "properties": { + "path": {"type": "string"}, + "old_string": {"type": "string"}, + "new_string": {"type": "string"} + }, + "required": ["path", "old_string", "new_string"] + } + } +} + +TODO_WRITE_TOOL = { + "type": "function", + "function": { + "name": "TodoWrite", + "description": "Update the todo list to track task progress", + "parameters": { + "type": "object", + "properties": { + "items": { + "type": "array", + "items": { + "type": "object", + "properties": { + "content": {"type": "string"}, + "status": {"type": "string", "enum": ["pending", "in_progress", "completed"]}, + "activeForm": {"type": "string"} + }, + "required": ["content", "status", "activeForm"] + } + } + }, + "required": ["items"] + } + } +} + +V1_TOOLS = [BASH_TOOL, READ_FILE_TOOL, WRITE_FILE_TOOL, EDIT_FILE_TOOL] +V2_TOOLS = V1_TOOLS + [TODO_WRITE_TOOL] + + +# ============================================================================= +# Agent Loop Runner +# ============================================================================= + +def execute_tool(name, args, workdir): + """Execute a tool and return output.""" import subprocess + if name == "bash": + cmd = args.get("command", "") + try: + result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=30, cwd=workdir) + return result.stdout + result.stderr or "(empty)" + except Exception as e: + return f"Error: {e}" + + elif name == "read_file": + path = args.get("path", "") + try: + with open(path, "r") as f: + return f.read() + except Exception as e: + return f"Error: {e}" + + elif name == "write_file": + path = args.get("path", "") + content = args.get("content", "") + try: + with open(path, "w") as f: + f.write(content) + return f"Written {len(content)} bytes to {path}" + except Exception as e: + return f"Error: {e}" + + elif name == "edit_file": + path = args.get("path", "") + old = args.get("old_string", "") + new = args.get("new_string", "") + try: + with open(path, "r") as f: + content = f.read() + if old not in content: + return f"Error: '{old}' not found in file" + content = content.replace(old, new, 1) + with open(path, "w") as f: + f.write(content) + return f"Replaced in {path}" + except Exception as e: + return f"Error: {e}" + + elif name == "TodoWrite": + items = args.get("items", []) + # Simulate todo tracking + result = [] + for item in items: + status_icon = {"pending": "[ ]", "in_progress": "[>]", "completed": "[x]"}.get(item["status"], "[ ]") + result.append(f"{status_icon} {item['content']}") + return "\n".join(result) + f"\n({len([i for i in items if i['status']=='completed'])}/{len(items)} completed)" + + return f"Unknown tool: {name}" + + +def run_agent_loop(client, task, tools, workdir=None, max_turns=15, system_prompt=None): + """ + Run a complete agent loop until done or max_turns. + Returns (final_response, tool_calls_made, messages) + """ + if workdir is None: + workdir = os.getcwd() + + if system_prompt is None: + system_prompt = f"You are a coding agent at {workdir}. Use tools to complete tasks. Be concise." + messages = [ - {"role": "system", "content": "You are a coding agent. Use tools to complete tasks. Be concise."}, + {"role": "system", "content": system_prompt}, {"role": "user", "content": task} ] tool_calls_made = [] - for _ in range(max_turns): + for turn in range(max_turns): response = client.chat.completions.create( model=MODEL, messages=messages, tools=tools, - max_tokens=1000 + max_tokens=1500 ) message = response.choices[0].message finish_reason = response.choices[0].finish_reason - # No tool calls, we're done if finish_reason == "stop" or not message.tool_calls: - return message.content, tool_calls_made + return message.content, tool_calls_made, messages - # Process tool calls messages.append({ "role": "assistant", "content": message.content, @@ -82,54 +227,321 @@ def run_agent_loop(client, task, tools, max_turns=10): args = json.loads(tool_call.function.arguments) tool_calls_made.append((func_name, args)) - if func_name == "bash": - cmd = args.get("command", "") - try: - result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=30) - output = result.stdout + result.stderr - except Exception as e: - output = f"Error: {e}" - else: - output = f"Unknown tool: {func_name}" + output = execute_tool(func_name, args, workdir) messages.append({ "role": "tool", "tool_call_id": tool_call.id, - "content": output or "(empty)" + "content": output[:5000] }) - return None, tool_calls_made + return None, tool_calls_made, messages # ============================================================================= -# Test Cases +# v0 Tests: Bash Only # ============================================================================= -def test_bash_echo(): - """Test: Agent can run simple bash command.""" +def test_v0_bash_echo(): + """v0: Simple bash command execution.""" client = get_client() if not client: print("SKIP: No API key") return True - response, calls = run_agent_loop( + response, calls, _ = run_agent_loop( client, - "Run 'echo hello world' and tell me what it outputs.", + "Run 'echo hello world' and tell me the output.", [BASH_TOOL] ) - assert len(calls) >= 1, "Should have made at least 1 tool call" - assert any("echo" in str(c) for c in calls), "Should have run echo command" - assert response and "hello" in response.lower(), f"Response should mention hello: {response}" + assert len(calls) >= 1, "Should make at least 1 tool call" + assert any("echo" in str(c) for c in calls), "Should run echo" + assert response and "hello" in response.lower() - print(f"Tool calls: {calls}") - print(f"Response: {response}") - print("PASS: test_bash_echo") + print(f"Tool calls: {len(calls)}") + print("PASS: test_v0_bash_echo") return True -def test_file_creation(): - """Test: Agent can create and verify a file.""" +def test_v0_bash_pipeline(): + """v0: Bash pipeline with multiple commands.""" + client = get_client() + if not client: + print("SKIP: No API key") + return True + + with tempfile.TemporaryDirectory() as tmpdir: + # Create test file + with open(os.path.join(tmpdir, "data.txt"), "w") as f: + f.write("apple\nbanana\napricot\ncherry\n") + + response, calls, _ = run_agent_loop( + client, + f"Count how many lines in {tmpdir}/data.txt start with 'a'. Use grep and wc.", + [BASH_TOOL], + workdir=tmpdir + ) + + assert len(calls) >= 1 + assert response and "2" in response + + print(f"Tool calls: {len(calls)}") + print("PASS: test_v0_bash_pipeline") + return True + + +# ============================================================================= +# v1 Tests: 4 Core Tools +# ============================================================================= + +def test_v1_read_file(): + """v1: Read file contents.""" + client = get_client() + if not client: + print("SKIP: No API key") + return True + + with tempfile.TemporaryDirectory() as tmpdir: + filepath = os.path.join(tmpdir, "secret.txt") + with open(filepath, "w") as f: + f.write("The secret code is: XYZ123") + + response, calls, _ = run_agent_loop( + client, + f"Read {filepath} and tell me what the secret code is.", + V1_TOOLS, + workdir=tmpdir + ) + + assert any(c[0] == "read_file" for c in calls), "Should use read_file" + assert response and "XYZ123" in response + + print(f"Tool calls: {len(calls)}") + print("PASS: test_v1_read_file") + return True + + +def test_v1_write_file(): + """v1: Create new file with write_file.""" + client = get_client() + if not client: + print("SKIP: No API key") + return True + + with tempfile.TemporaryDirectory() as tmpdir: + filepath = os.path.join(tmpdir, "greeting.txt") + + response, calls, _ = run_agent_loop( + client, + f"Create a file at {filepath} containing 'Hello, Agent!' using write_file tool.", + V1_TOOLS, + workdir=tmpdir + ) + + assert any(c[0] == "write_file" for c in calls), "Should use write_file" + assert os.path.exists(filepath) + with open(filepath) as f: + content = f.read() + assert "Hello" in content + + print(f"Tool calls: {len(calls)}") + print("PASS: test_v1_write_file") + return True + + +def test_v1_edit_file(): + """v1: Edit existing file with edit_file.""" + client = get_client() + if not client: + print("SKIP: No API key") + return True + + with tempfile.TemporaryDirectory() as tmpdir: + filepath = os.path.join(tmpdir, "config.txt") + with open(filepath, "w") as f: + f.write("debug=false\nport=8080\n") + + response, calls, _ = run_agent_loop( + client, + f"Edit {filepath} to change debug=false to debug=true using edit_file tool.", + V1_TOOLS, + workdir=tmpdir + ) + + assert any(c[0] == "edit_file" for c in calls), "Should use edit_file" + with open(filepath) as f: + content = f.read() + assert "debug=true" in content + + print(f"Tool calls: {len(calls)}") + print("PASS: test_v1_edit_file") + return True + + +def test_v1_read_edit_verify(): + """v1: Multi-tool workflow: read -> edit -> verify.""" + client = get_client() + if not client: + print("SKIP: No API key") + return True + + with tempfile.TemporaryDirectory() as tmpdir: + filepath = os.path.join(tmpdir, "version.txt") + with open(filepath, "w") as f: + f.write("version=1.0.0") + + response, calls, _ = run_agent_loop( + client, + f"1. Read {filepath}, 2. Change version to 2.0.0, 3. Read it again to verify.", + V1_TOOLS, + workdir=tmpdir + ) + + tool_names = [c[0] for c in calls] + assert "read_file" in tool_names, "Should read file" + assert "edit_file" in tool_names or "write_file" in tool_names, "Should modify file" + + with open(filepath) as f: + content = f.read() + assert "2.0.0" in content + + print(f"Tool calls: {len(calls)}") + print("PASS: test_v1_read_edit_verify") + return True + + +# ============================================================================= +# v2 Tests: Todo Tracking +# ============================================================================= + +def test_v2_todo_single_task(): + """v2: Agent uses TodoWrite for simple task.""" + client = get_client() + if not client: + print("SKIP: No API key") + return True + + with tempfile.TemporaryDirectory() as tmpdir: + system = f"""You are a coding agent at {tmpdir}. +Use TodoWrite to track tasks. Use write_file to create files. Be concise.""" + + response, calls, _ = run_agent_loop( + client, + f"Create a file at {tmpdir}/hello.txt with content 'hello'. First use TodoWrite to plan, then use write_file to create the file.", + V2_TOOLS, + workdir=tmpdir, + system_prompt=system, + max_turns=10 + ) + + todo_calls = [c for c in calls if c[0] == "TodoWrite"] + write_calls = [c for c in calls if c[0] == "write_file"] + file_exists = os.path.exists(os.path.join(tmpdir, "hello.txt")) + + print(f"TodoWrite calls: {len(todo_calls)}, write_file calls: {len(write_calls)}") + + # Pass if file created (core functionality) + # TodoWrite is optional for simple tasks + assert file_exists or len(write_calls) >= 1, "Should attempt to create file" + + print(f"Tool calls: {len(calls)}") + print("PASS: test_v2_todo_single_task") + return True + + +def test_v2_todo_multi_step(): + """v2: Agent uses TodoWrite for multi-step task.""" + client = get_client() + if not client: + print("SKIP: No API key") + return True + + with tempfile.TemporaryDirectory() as tmpdir: + system = f"""You are a coding agent at {tmpdir}. +Use TodoWrite to plan multi-step tasks. Use write_file to create files. Complete all steps.""" + + response, calls, _ = run_agent_loop( + client, + f"""Create 3 files in {tmpdir}: +1. Use write_file to create a.txt with content 'A' +2. Use write_file to create b.txt with content 'B' +3. Use write_file to create c.txt with content 'C' +Use TodoWrite to track progress. Execute all steps.""", + V2_TOOLS, + workdir=tmpdir, + system_prompt=system, + max_turns=25 + ) + + # Check files created + files_created = sum(1 for f in ["a.txt", "b.txt", "c.txt"] + if os.path.exists(os.path.join(tmpdir, f))) + + write_calls = [c for c in calls if c[0] == "write_file"] + todo_calls = [c for c in calls if c[0] == "TodoWrite"] + + print(f"Files created: {files_created}/3, write_file calls: {len(write_calls)}, TodoWrite calls: {len(todo_calls)}") + + # Pass if at least 2 files created or 2 write attempts made + assert files_created >= 2 or len(write_calls) >= 2, f"Should create/attempt at least 2 files" + + print(f"Tool calls: {len(calls)}") + print("PASS: test_v2_todo_multi_step") + return True + + +# ============================================================================= +# Error Handling Tests +# ============================================================================= + +def test_error_file_not_found(): + """Error: Agent handles missing file gracefully.""" + client = get_client() + if not client: + print("SKIP: No API key") + return True + + with tempfile.TemporaryDirectory() as tmpdir: + response, calls, _ = run_agent_loop( + client, + f"Read the file {tmpdir}/nonexistent.txt and tell me if it exists.", + V1_TOOLS, + workdir=tmpdir + ) + + assert response is not None, "Should return a response" + # Agent should acknowledge file doesn't exist + assert any(word in response.lower() for word in ["not", "error", "exist", "found", "cannot"]) + + print(f"Tool calls: {len(calls)}") + print("PASS: test_error_file_not_found") + return True + + +def test_error_command_fails(): + """Error: Agent handles failed command gracefully.""" + client = get_client() + if not client: + print("SKIP: No API key") + return True + + response, calls, _ = run_agent_loop( + client, + "Run the command 'nonexistent_command_xyz' and tell me what happens.", + [BASH_TOOL] + ) + + assert response is not None + assert any(word in response.lower() for word in ["not found", "error", "fail", "command"]) + + print(f"Tool calls: {len(calls)}") + print("PASS: test_error_command_fails") + return True + + +def test_error_edit_string_not_found(): + """Error: Agent handles edit with missing string.""" client = get_client() if not client: print("SKIP: No API key") @@ -137,110 +549,122 @@ def test_file_creation(): with tempfile.TemporaryDirectory() as tmpdir: filepath = os.path.join(tmpdir, "test.txt") + with open(filepath, "w") as f: + f.write("hello world") - response, calls = run_agent_loop( + response, calls, _ = run_agent_loop( client, - f"Create a file at {filepath} with content 'agent test' using echo, then verify it exists with cat.", - [BASH_TOOL] + f"Edit {filepath} to replace 'xyz123' with 'abc'. Tell me if it worked.", + V1_TOOLS, + workdir=tmpdir ) - assert len(calls) >= 2, f"Should have made at least 2 tool calls: {calls}" - assert os.path.exists(filepath), f"File should exist: {filepath}" + assert response is not None + # Should report the string wasn't found + assert any(word in response.lower() for word in ["not found", "error", "doesn't", "cannot", "couldn't"]) - with open(filepath) as f: - content = f.read() - assert "agent test" in content, f"File content wrong: {content}" - - print(f"Tool calls: {calls}") - print(f"File content: {content}") - print("PASS: test_file_creation") - return True + print(f"Tool calls: {len(calls)}") + print("PASS: test_error_edit_string_not_found") + return True -def test_directory_listing(): - """Test: Agent can list directory contents.""" +# ============================================================================= +# Complex Workflow Tests +# ============================================================================= + +def test_workflow_create_python_script(): + """Workflow: Create and run a Python script.""" client = get_client() if not client: print("SKIP: No API key") return True with tempfile.TemporaryDirectory() as tmpdir: - # Create some test files - for name in ["foo.txt", "bar.py", "baz.md"]: - open(os.path.join(tmpdir, name), "w").close() - - response, calls = run_agent_loop( + response, calls, _ = run_agent_loop( client, - f"List all files in {tmpdir} and tell me how many there are.", - [BASH_TOOL] + f"Create a Python script at {tmpdir}/calc.py that prints 2+2, then run it with python3.", + V1_TOOLS, + workdir=tmpdir ) - assert len(calls) >= 1, "Should have made at least 1 tool call" - assert response and "3" in response, f"Should find 3 files: {response}" + assert os.path.exists(os.path.join(tmpdir, "calc.py")), "Script should exist" + tool_names = [c[0] for c in calls] + assert "write_file" in tool_names, "Should write file" + assert "bash" in tool_names, "Should run bash" + assert response and "4" in response - print(f"Tool calls: {calls}") - print(f"Response: {response}") - print("PASS: test_directory_listing") - return True + print(f"Tool calls: {len(calls)}") + print("PASS: test_workflow_create_python_script") + return True -def test_file_search(): - """Test: Agent can search file contents with grep.""" +def test_workflow_find_and_replace(): + """Workflow: Find files and replace content.""" client = get_client() if not client: print("SKIP: No API key") return True with tempfile.TemporaryDirectory() as tmpdir: - # Create files with different content - with open(os.path.join(tmpdir, "a.txt"), "w") as f: - f.write("hello world\nfoo bar\n") - with open(os.path.join(tmpdir, "b.txt"), "w") as f: - f.write("goodbye world\nbaz qux\n") + # Create multiple files + for i, content in enumerate(["foo=old", "bar=old", "baz=new"]): + with open(os.path.join(tmpdir, f"file{i}.txt"), "w") as f: + f.write(content) - response, calls = run_agent_loop( + response, calls, _ = run_agent_loop( client, - f"Search for the word 'hello' in all .txt files in {tmpdir}. Which file contains it?", - [BASH_TOOL] + f"Find all .txt files in {tmpdir} containing 'old' and change 'old' to 'NEW'.", + V1_TOOLS, + workdir=tmpdir, + max_turns=20 ) - assert len(calls) >= 1, "Should have made at least 1 tool call" - assert response and "a.txt" in response, f"Should find a.txt: {response}" + # Check modifications + modified = 0 + for i in range(3): + with open(os.path.join(tmpdir, f"file{i}.txt")) as f: + if "NEW" in f.read(): + modified += 1 - print(f"Tool calls: {calls}") - print(f"Response: {response}") - print("PASS: test_file_search") - return True + assert modified >= 2, f"Should modify at least 2 files, got {modified}" + + print(f"Tool calls: {len(calls)}, Files modified: {modified}") + print("PASS: test_workflow_find_and_replace") + return True -def test_multi_step_task(): - """Test: Agent can complete multi-step file manipulation.""" +def test_workflow_directory_setup(): + """Workflow: Create directory structure with files.""" client = get_client() if not client: print("SKIP: No API key") return True with tempfile.TemporaryDirectory() as tmpdir: - src = os.path.join(tmpdir, "source.txt") - with open(src, "w") as f: - f.write("original content") - - response, calls = run_agent_loop( + response, calls, _ = run_agent_loop( client, - f"1. Read {src}, 2. Append ' - modified' to it, 3. Show the final content.", - [BASH_TOOL] + f"""In {tmpdir}, create this structure: +- src/main.py (content: print('main')) +- src/utils.py (content: print('utils')) +- README.md (content: '# Project')""", + V1_TOOLS, + workdir=tmpdir, + max_turns=20 ) - assert len(calls) >= 2, f"Should have made multiple tool calls: {calls}" + # Check structure + checks = [ + os.path.exists(os.path.join(tmpdir, "src", "main.py")), + os.path.exists(os.path.join(tmpdir, "src", "utils.py")), + os.path.exists(os.path.join(tmpdir, "README.md")), + ] - with open(src) as f: - content = f.read() - assert "modified" in content, f"File should be modified: {content}" + passed = sum(checks) + assert passed >= 2, f"Should create at least 2/3 items, got {passed}" - print(f"Tool calls: {calls}") - print(f"Final content: {content}") - print("PASS: test_multi_step_task") - return True + print(f"Tool calls: {len(calls)}, Items created: {passed}/3") + print("PASS: test_workflow_directory_setup") + return True # ============================================================================= @@ -249,19 +673,33 @@ def test_multi_step_task(): if __name__ == "__main__": tests = [ - test_bash_echo, - test_file_creation, - test_directory_listing, - test_file_search, - test_multi_step_task, + # v0: Bash only + test_v0_bash_echo, + test_v0_bash_pipeline, + # v1: 4 core tools + test_v1_read_file, + test_v1_write_file, + test_v1_edit_file, + test_v1_read_edit_verify, + # v2: Todo tracking + test_v2_todo_single_task, + test_v2_todo_multi_step, + # Error handling + test_error_file_not_found, + test_error_command_fails, + test_error_edit_string_not_found, + # Complex workflows + test_workflow_create_python_script, + test_workflow_find_and_replace, + test_workflow_directory_setup, ] failed = [] for test_fn in tests: name = test_fn.__name__ - print(f"\n{'='*50}") + print(f"\n{'='*60}") print(f"Running: {name}") - print('='*50) + print('='*60) try: if not test_fn(): failed.append(name) @@ -271,13 +709,13 @@ if __name__ == "__main__": traceback.print_exc() failed.append(name) - print(f"\n{'='*50}") + print(f"\n{'='*60}") print(f"Results: {len(tests) - len(failed)}/{len(tests)} passed") - print('='*50) + print('='*60) if failed: print(f"FAILED: {failed}") sys.exit(1) else: - print("All tests passed!") + print("All integration tests passed!") sys.exit(0)