Tuesday, April 28, 2026

AI Agent to extract info from a static web page

 # STEP 1.1 INSTALL THE REQUIRED PACKAGES

!pip install langchain_community langchain_google_genai
!pip install -U duckduckgo-search

# You may see an error regarding the version of the `requests` package:
# Google Colab requires a lower version. You can safely ignore this message
# because we are not using Google Colab libraries in this notebook.






# STEP 1.2 IMPORT THE NECESSARY MODULES

# For loading the OpenAI API key safely into the colab environment
import os
from google.colab import userdata

# For Extracting the text from the URL
import requests
from bs4 import BeautifulSoup

# For creating the agent
#from langchain.agents import initialize_agent, Tool, AgentType
#from langchain.chat_models import ChatOpenAI

#### # STEP 1.3 LOAD THE OPEN AI API KEY
#### os.environ["OPENAI_API_KEY"] = userdata.get('openai')
#### The original tutorial used OpenAI, but the OpenAI API key is not free, hence we use Gemini instead.

# Retrieve the secret (stored in Colab's secret manager under 'geminiapikey')
# and expose it as the GOOGLE_API_KEY environment variable.
os.environ["GOOGLE_API_KEY"] = userdata.get('geminiapikey')

# LangChain picks the key up automatically from the environment variable.
from langchain_google_genai import ChatGoogleGenerativeAI
# Alternative models, kept commented for easy switching:
#llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash-lite", temperature=0)
#llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0)
# temperature=0 makes the output deterministic — preferable for extraction tasks.
llm = ChatGoogleGenerativeAI(model="gemini-3-flash-preview", temperature=0)


# For creating the agent
#from langchain.agents import initialize_agent, Tool, AgentType
#from langchain.chat_models import ChatOpenAI


# Smoke-test the Gemini connection with a trivial prompt before building the agent.
try:
    reply = llm.invoke("Are you online? Answer with 'Yes' and your model version.")
    print(f"Response: {reply.content}")
except Exception as exc:
    # Most likely causes: missing/invalid API key or no network access.
    print(f"Error: {exc}")







# NEW 2026 IMPORTS
from langchain.agents import create_agent
from langchain_core.tools import tool

# 2. DEFINE THE WEB TOOL
@tool
def website_qa_tool(url_and_question: str) -> str:
    """
    Input format: 'url | question'.
    Scrapes a website and returns the content for answering questions.
    """
    # NOTE: the docstring above is runtime behavior — the agent reads it as the
    # tool description — so it is kept intact.
    try:
        # Split only on the first '|' so the question itself may contain '|'.
        url, question = url_and_question.split("|", 1)
        # A browser-like User-Agent avoids trivial bot blocking on many sites.
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url.strip(), headers=headers, timeout=10)
        # Fail fast on HTTP errors (404/500/...) instead of silently scraping
        # the error page and feeding its text to the model.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove non-content elements so only human-readable text remains.
        for script in soup(["script", "style"]):
            script.extract()

        # Collapse all whitespace runs into single spaces.
        text = " ".join(soup.get_text().split())
        return text
        #return text[:25000] # Gemini 2.5 context handles this easily
    except Exception as e:
        # Return the error as a string so the agent can see what went wrong
        # and react (e.g. retry with a different URL) instead of crashing.
        return f"Scraping Error: {str(e)}"

# 3. INITIALIZE GEMINI


# 4. CREATE THE AGENT (Modern 2026 Method)
# This replaces create_react_agent AND AgentExecutor.
# create_agent wires the model and the tool list into a ready-to-run graph;
# the system prompt steers the model toward calling website_qa_tool for URLs.
agent = create_agent(
    model=llm,
    tools=[website_qa_tool],
    system_prompt="You are a research assistant. Use the website_qa_tool to find information from URLs."
)






# 5. RUN
print("\n--- Running Agent ---\n")
# Example queries in the tool's 'url | question' format — swap in any one:
#query = "https://www.wikipedia.org | What is the main mission of Wikipedia?"
#query = "https://www.w3schools.com | Do they have any Angular course (NOT AngularJS) ?"
#query = "https://www.tutorialspoint.com | does it have any tutorial on DSA Learning  ? If yes please give link"
#query = "https://en.wikipedia.org/wiki/Arizona | How many private colleges and universities are there in Arizona State ?"
query = "https://en.wikipedia.org/wiki/Arizona | How many films were shot in Arizona State ?"

try:
    # In 2026, agents are run via .invoke() with a 'messages' list
    result = agent.invoke({"messages": [("user", query)]})
    print(result)

    # The agent's reply is the last message in the returned conversation.
    answer = result["messages"][-1].content
    print("\n--- FINAL ANSWER ---")
    print(answer)
except Exception as exc:
    print(f"\n❌ Agent Error: {exc}")

No comments:

Post a Comment

AI Agent to extract info from a static web page

  # STEP 1.1 INSTALL THE REQUIRED PACKAGES ! pip install langchain_community langchain_google_genai ! pip install -U duckduckgo-search #you ...