# ================================
# STEP 1: RAW DATASET
# ================================
# Raw knowledge base: one short Kubernetes troubleshooting fact per entry,
# listed topic by topic. Order matters because chunking groups neighbours.
documents = [
    # Pod failures
    "CrashLoopBackOff occurs when a container repeatedly crashes after starting.",
    "A container may crash due to missing environment variables.",
    "Incorrect command or entrypoint can cause container startup failure.",
    "Application errors inside the container often lead to restarts.",
    "OOMKilled happens when a container exceeds its memory limit.",

    # Image issues
    "ImagePullBackOff occurs when Kubernetes cannot pull the container image.",
    "Incorrect image name or tag can cause image pull failures.",
    "Private registries require imagePullSecrets for authentication.",

    # Debugging commands
    "kubectl logs retrieves logs from a running container.",
    "kubectl describe pod shows events and state transitions.",

    # Scheduling issues
    "Pods remain pending if no node satisfies resource requests.",
    "Node affinity restricts pods to specific nodes.",
    "Taints prevent pods from being scheduled on certain nodes.",
    "Tolerations allow pods to be scheduled on tainted nodes.",

    # Probes
    "Liveness probes determine if a container should be restarted.",
    "Readiness probes determine if a pod can receive traffic.",
    "A failing readiness probe removes the pod from service endpoints.",

    # Networking
    "ClusterIP services expose applications within the cluster.",
    "NodePort services expose applications on node IPs.",

    # Storage
    "PersistentVolumes provide storage independent of pods.",
    "PersistentVolumeClaims request storage resources.",

    # Configuration
    "ConfigMaps store non-sensitive configuration data.",
    "Secrets store sensitive data like passwords and tokens.",

    # Deployment and scaling
    "Deployments manage replica sets and pod updates.",
    "Horizontal Pod Autoscaler scales based on CPU or metrics.",

    # Miscellaneous
    "Pods stuck in Terminating state may have finalizers blocking deletion.",
    "RBAC misconfiguration can block access to resources.",
]
# ================================
# STEP 2: CLEANING FUNCTION
# ================================
def clean_text(text):
    """Trim surrounding whitespace and flatten line breaks into spaces.

    Args:
        text: The string to clean.

    Returns:
        The input without leading or trailing whitespace, and with every
        line break (Windows CRLF, bare CR, or Unix LF) replaced by a single
        space. Other interior whitespace is left untouched.
    """
    flattened = text.strip()
    # CRLF must be handled first so a Windows line ending becomes one space
    # rather than two (one for the CR and one for the LF).
    for line_break in ("\r\n", "\r", "\n"):
        flattened = flattened.replace(line_break, " ")
    return flattened
# ================================
# STEP 3: BASIC CHUNKING
# ================================
def chunk_documents(docs, chunk_size=3):
    """Group consecutive documents into space-joined text chunks.

    Args:
        docs: Sequence of strings; it must support ``len`` and slicing.
        chunk_size: Maximum number of documents per chunk; must be >= 1.

    Returns:
        A list of strings, each the space-separated concatenation of up to
        ``chunk_size`` consecutive documents. The final chunk is shorter
        when ``len(docs)`` is not a multiple of ``chunk_size``. An empty
        ``docs`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    if chunk_size < 1:
        # Without this guard a negative step makes range() empty, silently
        # dropping every document, and a zero step fails with an unrelated
        # range() error message.
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    return [
        " ".join(docs[start:start + chunk_size])
        for start in range(0, len(docs), chunk_size)
    ]
chunks = chunk_documents(documents, chunk_size=3)
# ================================
# STEP 4: ADD METADATA STRUCTURE
# ================================
# Each record pairs a positional id ("chunk_<index>") with the cleaned text
# of the chunk at that index.
prepared_data = [
    {"id": f"chunk_{index}", "text": clean_text(body)}
    for index, body in enumerate(chunks)
]
# ================================
# STEP 5: INSPECT OUTPUT
# ================================
# Preview only the first five records, each followed by a 50-dash divider.
divider = "-" * 50
for record in prepared_data[:5]:
    print(record, divider, sep="\n")
# ================================
# OPTIONAL: SIZE CHECK
# ================================
# Report how many source sentences went in and how many chunks came out.
raw_count = len(documents)
chunk_count = len(prepared_data)
print(f"Total raw documents: {raw_count}")
print(f"Total chunks created: {chunk_count}")
# ================================
# SAMPLE OUTPUT
# ================================
# {'id': 'chunk_0', 'text': 'CrashLoopBackOff occurs when a container repeatedly crashes after starting. A container may crash due to missing environment variables. Incorrect command or entrypoint can cause container startup failure.'}
# --------------------------------------------------
# {'id': 'chunk_1', 'text': 'Application errors inside the container often lead to restarts. OOMKilled happens when a container exceeds its memory limit. ImagePullBackOff occurs when Kubernetes cannot pull the container image.'}
# --------------------------------------------------
# {'id': 'chunk_2', 'text': 'Incorrect image name or tag can cause image pull failures. Private registries require imagePullSecrets for authentication. kubectl logs retrieves logs from a running container.'}
# --------------------------------------------------
# {'id': 'chunk_3', 'text': 'kubectl describe pod shows events and state transitions. Pods remain pending if no node satisfies resource requests. Node affinity restricts pods to specific nodes.'}
# --------------------------------------------------
# {'id': 'chunk_4', 'text': 'Taints prevent pods from being scheduled on certain nodes. Tolerations allow pods to be scheduled on tainted nodes. Liveness probes determine if a container should be restarted.'}
# --------------------------------------------------
# Total raw documents: 27
# Total chunks created: 9
No comments:
Post a Comment