Build a full-fledged documentation search engine from scratch: data collection, indexing, search, hybrid search, deployment.

In 90 minutes you will build the complete pipeline, starting with data collection:
# docs_collector.py
import requests
from bs4 import BeautifulSoup
from pathlib import Path
import json

class DocsCollector:
    """Collects documentation pages from a website."""

    def __init__(self, output_dir: str = "docs"):
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def fetch_page(self, url: str) -> str:
        """Fetch the HTML of a page."""
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.text

    def parse_content(self, html: str, url: str) -> dict:
        """Extract the title and main content of a page."""
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.find('h1')
        title = title.text.strip() if title else "Untitled"
        # Extract the main content
        content = []
        for tag in soup.find_all(['p', 'code', 'pre']):
            text = tag.get_text(strip=True)
            if text:
                content.append(text)
        return {
            "title": title,
            "content": "\n".join(content),
            "url": url  # store the page's own URL, not a shared base URL
        }

    def collect(self, urls: list[str]) -> list[dict]:
        """Collect documents from a list of URLs."""
        documents = []
        for i, url in enumerate(urls):
            print(f"Processing {i+1}/{len(urls)}: {url}")
            try:
                html = self.fetch_page(url)
                doc = self.parse_content(html, url)
                documents.append(doc)
            except Exception as e:
                print(f"Error for {url}: {e}")
        # Save to disk
        with open(self.output_dir / "documents.json", "w", encoding="utf-8") as f:
            json.dump(documents, f, indent=2, ensure_ascii=False)
        print(f"Documents collected: {len(documents)}")
        return documents

# Usage
if __name__ == "__main__":
    urls = [
        "https://docs.python.org/3/tutorial/introduction.html",
        "https://docs.python.org/3/tutorial/datastructures.html",
        # Add more URLs
    ]
    collector = DocsCollector()
    documents = collector.collect(urls)
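Listing URLs by hand gets tedious fast. Here is a minimal sketch of automatic discovery, assuming the documentation site has an index page whose links point at the pages you want; the discover_urls helper below is illustrative, not part of the course code:

# url_discovery.py (hypothetical helper)
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

def discover_urls(index_url: str) -> list[str]:
    """Collect absolute links from an index page, restricted to its own section."""
    html = requests.get(index_url, timeout=30).text
    soup = BeautifulSoup(html, 'html.parser')
    section = index_url.rsplit('/', 1)[0]
    urls = set()
    for a in soup.find_all('a', href=True):
        url = urljoin(index_url, a['href']).split('#')[0]  # drop in-page anchors
        if url.startswith(section):
            urls.add(url)
    return sorted(urls)

# urls = discover_urls("https://docs.python.org/3/tutorial/index.html")
# documents = DocsCollector().collect(urls)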
# chunker.py
class TextChunker:
    """Splits text into chunks."""

    def __init__(
        self,
        chunk_size: int = 500,
        chunk_overlap: int = 50
    ):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def chunk_text(self, text: str) -> list[str]:
        """Split a text into chunks."""
        chunks = []
        start = 0
        while start < len(text):
            end = start + self.chunk_size
            chunk = text[start:end]
            # Avoid cutting in the middle of a sentence
            if end < len(text):
                last_period = chunk.rfind('.')
                if last_period > self.chunk_size // 2:
                    chunk = chunk[:last_period + 1]
                    end = start + last_period + 1
            chunks.append(chunk.strip())
            start = end - self.chunk_overlap
        return chunks

    def chunk_documents(self, documents: list[dict]) -> list[dict]:
        """Split documents into chunks."""
        chunks = []
        for doc in documents:
            doc_chunks = self.chunk_text(doc['content'])
            for i, chunk in enumerate(doc_chunks):
                chunks.append({
                    "source": doc.get('title', 'Unknown'),
                    "url": doc.get('url', ''),
                    "chunk_id": i,
                    "content": chunk
                })
        print(f"Chunks created: {len(chunks)}")
        return chunks

# Usage
chunker = TextChunker(chunk_size=500, chunk_overlap=50)
chunks = chunker.chunk_documents(documents)
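A quick sanity check of the chunker (the sample string is made up; it only demonstrates the overlap behaviour):

# Sanity check: consecutive chunks share ~chunk_overlap characters
sample = "Lists are mutable sequences. " * 60  # ~1700 characters of toy text
demo_chunks = TextChunker(chunk_size=500, chunk_overlap=50).chunk_text(sample)
print(len(demo_chunks))               # a handful of chunks
print([len(c) for c in demo_chunks])  # each close to 500 characters
# Each step backs up by ~50 characters, so a sentence that straddles
# a boundary still appears whole in at least one chunk.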
# indexer.py
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import pickle

class DocsIndexer:
    """Indexes documentation chunks."""

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
        self.dimension = self.model.get_sentence_embedding_dimension()
        self.index = None
        self.chunks = []

    def create_index(self, chunks: list[dict]):
        """Build the index."""
        self.chunks = chunks
        texts = [c['content'] for c in chunks]
        print("Generating embeddings...")
        embeddings = self.model.encode(texts, batch_size=64, show_progress_bar=True)
        embeddings = embeddings.astype('float32')
        print("Building HNSW index...")
        # Inner product over L2-normalized vectors = cosine similarity,
        # so search() scores are "higher is better" (the default L2 metric
        # would invert the min_score logic used later)
        self.index = faiss.IndexHNSWFlat(self.dimension, 32, faiss.METRIC_INNER_PRODUCT)
        self.index.hnsw.efConstruction = 200
        faiss.normalize_L2(embeddings)
        self.index.add(embeddings)
        print(f"Index built: {self.index.ntotal} vectors")

    def save(self, path: str):
        """Save the index to disk."""
        faiss.write_index(self.index, f"{path}.faiss")
        with open(f"{path}.chunks.pkl", "wb") as f:
            pickle.dump(self.chunks, f)
        print(f"Index saved: {path}")

    def load(self, path: str):
        """Load the index from disk."""
        self.index = faiss.read_index(f"{path}.faiss")
        with open(f"{path}.chunks.pkl", "rb") as f:
            self.chunks = pickle.load(f)
        print(f"Index loaded: {self.index.ntotal} vectors")

# Usage
indexer = DocsIndexer()
indexer.create_index(chunks)
indexer.save("docs_index")
# search_service.py
import re
import pickle

import faiss
from sentence_transformers import SentenceTransformer

class DocsSearch:
    """Documentation search service."""

    def __init__(self, index_path: str):
        self.index = faiss.read_index(f"{index_path}.faiss")
        with open(f"{index_path}.chunks.pkl", "rb") as f:
            self.chunks = pickle.load(f)
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def search(
        self,
        query: str,
        k: int = 5,
        min_score: float = 0.3
    ) -> list[dict]:
        """Search the documentation."""
        query_embedding = self.model.encode([query]).astype('float32')
        faiss.normalize_L2(query_embedding)
        # With the inner-product index these "distances" are cosine similarities
        distances, indices = self.index.search(query_embedding, k)
        results = []
        for idx, score in zip(indices[0], distances[0]):
            # faiss pads missing results with idx == -1, so guard the lower bound too
            if score >= min_score and 0 <= idx < len(self.chunks):
                chunk = self.chunks[idx]
                results.append({
                    "score": float(score),
                    "title": chunk.get('source', 'Unknown'),
                    "content": chunk['content'][:200] + "...",
                    "url": chunk.get('url', '')
                })
        return results

    def search_with_highlights(
        self,
        query: str,
        k: int = 5
    ) -> list[dict]:
        """Search with query terms highlighted."""
        results = self.search(query, k)
        query_words = {w for w in query.lower().split() if len(w) > 3}
        for result in results:
            highlighted = result['content']
            for word in query_words:
                # Case-insensitive whole-word highlighting
                highlighted = re.sub(
                    rf"\b({re.escape(word)})\b",
                    r"**\1**",
                    highlighted,
                    flags=re.IGNORECASE
                )
            result['highlighted'] = highlighted
        return results

# Usage
search = DocsSearch("docs_index")
results = search.search("how to use list comprehension", k=5)
for r in results:
    print(f"\n[{r['score']:.3f}] {r['title']}")
    print(f"URL: {r['url']}")
    print(f"Content: {r['content']}")
# hybrid_search.py
import faiss
import numpy as np
from rank_bm25 import BM25Okapi

from search_service import DocsSearch

class HybridDocsSearch(DocsSearch):
    """Hybrid (BM25 + vector) documentation search."""

    def __init__(self, index_path: str):
        super().__init__(index_path)
        # BM25 index over the same chunks
        self.tokenized_chunks = [
            c['content'].lower().split()
            for c in self.chunks
        ]
        self.bm25 = BM25Okapi(self.tokenized_chunks)

    def hybrid_search(
        self,
        query: str,
        k: int = 10,
        alpha: float = 0.7
    ) -> list[dict]:
        """Hybrid search: alpha weighs the vector score, (1 - alpha) the BM25 score."""
        # BM25 scores
        query_tokens = query.lower().split()
        bm25_scores = self.bm25.get_scores(query_tokens)
        # Vector scores: rank every chunk so the two score arrays align by index
        query_embedding = self.model.encode([query]).astype('float32')
        faiss.normalize_L2(query_embedding)
        distances, indices = self.index.search(query_embedding, len(self.chunks))
        vector_scores = np.zeros(len(self.chunks))
        valid = indices[0] >= 0  # skip the -1 padding faiss uses for missing results
        vector_scores[indices[0][valid]] = distances[0][valid]
        # Min-max normalization so the two scales are comparable
        bm25_norm = self._normalize(bm25_scores)
        vector_norm = self._normalize(vector_scores)
        # Weighted combination
        combined = alpha * vector_norm + (1 - alpha) * bm25_norm
        # Top-k
        top_indices = np.argsort(combined)[::-1][:k]
        results = []
        for idx in top_indices:
            if combined[idx] > 0.1:
                chunk = self.chunks[idx]
                results.append({
                    "score": float(combined[idx]),
                    "title": chunk['source'],
                    "content": chunk['content'][:200] + "...",
                    "url": chunk.get('url', '')
                })
        return results

    def _normalize(self, scores: np.ndarray) -> np.ndarray:
        min_s, max_s = scores.min(), scores.max()
        if max_s - min_s < 1e-6:
            return np.zeros_like(scores)
        return (scores - min_s) / (max_s - min_s)

# Usage
hybrid = HybridDocsSearch("docs_index")
results = hybrid.hybrid_search("Python list methods", k=5, alpha=0.7)
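Before settling on alpha, measure it. A minimal sketch of an evaluation harness, assuming you hand-label a few query-to-URL pairs; the labeled example below is a placeholder, not real ground truth:

# eval_alpha.py (sketch): recall@k over hand-labeled query -> relevant-URL pairs
labeled = [
    ("list comprehension syntax", "https://docs.python.org/3/tutorial/datastructures.html"),
    # add a dozen or more real pairs from your own docs
]

def recall_at_k(searcher, pairs, k: int = 5, alpha: float = 0.7) -> float:
    hits = 0
    for query, relevant_url in pairs:
        urls = [r['url'] for r in searcher.hybrid_search(query, k=k, alpha=alpha)]
        hits += relevant_url in urls
    return hits / len(pairs)

# Sweep alpha using the `hybrid` instance created above
for alpha in (0.0, 0.3, 0.5, 0.7, 1.0):
    print(f"alpha={alpha}: recall@5 = {recall_at_k(hybrid, labeled, alpha=alpha):.2f}")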
# app.py (FastAPI)
from fastapi import FastAPI, Query
from pydantic import BaseModel
from typing import List

from search_service import DocsSearch

app = FastAPI(title="Docs Search API")
search = DocsSearch("docs_index")

class SearchResult(BaseModel):
    score: float
    title: str
    content: str
    url: str

@app.get("/search", response_model=List[SearchResult])
def search_docs(
    q: str = Query(..., description="Search query"),
    k: int = Query(5, description="Number of results")
):
    """Search the documentation."""
    return search.search(q, k)

@app.get("/health")
def health():
    """Health check."""
    return {"status": "ok"}

# Run with:
# uvicorn app:app --host 0.0.0.0 --port 8000
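Once the server is running, querying it from Python is a few lines; the host and port here match the uvicorn command above:

# client example
import requests

resp = requests.get(
    "http://localhost:8000/search",
    params={"q": "list comprehension", "k": 3},
    timeout=10,
)
for hit in resp.json():
    print(f"[{hit['score']:.3f}] {hit['title']} - {hit['url']}")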
# Dockerfile
FROM python:3.11-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install -r requirements.txt
COPY . .
EXPOSE 8000
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
# docker-compose.yml
version: '3.8'
services:
  docs-search:
    build: .
    ports:
      - "8000:8000"
    volumes:
      - ./docs_index.faiss:/app/docs_index.faiss
      - ./docs_index.chunks.pkl:/app/docs_index.chunks.pkl
    restart: unless-stopped
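To bring the whole service up locally (assuming the index files were built next to the compose file, as the volume mounts expect):

# Run the stack:
# docker compose up --build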
You have completed the entire course and built a full-fledged documentation search engine!