Add CV vector store
This commit is contained in:
parent
fb95e6e799
commit
42395de6d3
@ -8,6 +8,7 @@ from vacancies.main.models import Customer, CustomerCV
|
|||||||
from langchain.agents import create_agent
|
from langchain.agents import create_agent
|
||||||
from langchain_openai import ChatOpenAI
|
from langchain_openai import ChatOpenAI
|
||||||
from langgraph.checkpoint.memory import InMemorySaver
|
from langgraph.checkpoint.memory import InMemorySaver
|
||||||
|
from vacancies.main.vector_store import add_vectors, extract_features
|
||||||
|
|
||||||
SYSTEM_PROMPT = """
|
SYSTEM_PROMPT = """
|
||||||
Ты — карьерный копилот для ИТ. Ты можешь отвечать на любые вопросы по тематике карьеры.
|
Ты — карьерный копилот для ИТ. Ты можешь отвечать на любые вопросы по тематике карьеры.
|
||||||
@ -63,7 +64,7 @@ async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> N
|
|||||||
async def handle_document(update: Update, context: ContextTypes.DEFAULT_TYPE):
|
async def handle_document(update: Update, context: ContextTypes.DEFAULT_TYPE):
|
||||||
if not update.message.document:
|
if not update.message.document:
|
||||||
await context.bot.send_message(chat_id=update.effective_chat.id, text="Не удалось прочитать информацию из файла! Попробуйте другой формат.")
|
await context.bot.send_message(chat_id=update.effective_chat.id, text="Не удалось прочитать информацию из файла! Попробуйте другой формат.")
|
||||||
return
|
return
|
||||||
|
|
||||||
buffer = io.BytesIO()
|
buffer = io.BytesIO()
|
||||||
file = await update.message.document.get_file()
|
file = await update.message.document.get_file()
|
||||||
@ -75,6 +76,13 @@ async def handle_document(update: Update, context: ContextTypes.DEFAULT_TYPE):
|
|||||||
customer_cv, _ = await CustomerCV.objects.aupdate_or_create(customer=customer, defaults=dict(
|
customer_cv, _ = await CustomerCV.objects.aupdate_or_create(customer=customer, defaults=dict(
|
||||||
content=resume,
|
content=resume,
|
||||||
))
|
))
|
||||||
|
features = extract_features(customer_cv.content)
|
||||||
|
add_vectors(
|
||||||
|
"cvs",
|
||||||
|
customer_cv.id,
|
||||||
|
features.model_dump(),
|
||||||
|
{'content': customer_cv.content, 'features_json': features.model_dump()},
|
||||||
|
)
|
||||||
|
|
||||||
await context.bot.send_message(chat_id=update.effective_chat.id, text="Отлично! Запомнил Ваше резюме.")
|
await context.bot.send_message(chat_id=update.effective_chat.id, text="Отлично! Запомнил Ваше резюме.")
|
||||||
|
|
||||||
|
|||||||
@ -1,7 +1,7 @@
|
|||||||
from django.core.management import BaseCommand
|
from django.core.management import BaseCommand
|
||||||
from vacancies.main.models import Vacancy
|
from vacancies.main.models import Vacancy
|
||||||
import clickhouse_connect
|
import clickhouse_connect
|
||||||
from vacancies.main.vector_store import add_vacancy_vectors, extract_vacancy_features
|
from vacancies.main.vector_store import add_vectors, extract_features
|
||||||
|
|
||||||
clickhouse_client = clickhouse_connect.create_client(host="127.0.0.1", port=18123)
|
clickhouse_client = clickhouse_connect.create_client(host="127.0.0.1", port=18123)
|
||||||
|
|
||||||
@ -24,15 +24,28 @@ class Command(BaseCommand):
|
|||||||
help = "Collect vacancies from telegram messages"
|
help = "Collect vacancies from telegram messages"
|
||||||
|
|
||||||
def handle(self, *args, **options):
|
def handle(self, *args, **options):
|
||||||
for index, row in enumerate(clickhouse_client.query(query).result_rows):
|
import time
|
||||||
|
start_time = time.time()
|
||||||
|
result_rows = clickhouse_client.query(query).result_rows
|
||||||
|
print(f"query time: {time.time() - start_time:.4f}")
|
||||||
|
result_rows_len = len(result_rows)
|
||||||
|
for index, row in enumerate(result_rows):
|
||||||
(id, chat_username, telegram_id, message, timestamp) = row
|
(id, chat_username, telegram_id, message, timestamp) = row
|
||||||
|
|
||||||
link = f"https://t.me/{chat_username}/{telegram_id}"
|
link = f"https://t.me/{chat_username}/{telegram_id}"
|
||||||
features = extract_vacancy_features(message)
|
print(f"Processing {index}/{result_rows_len} link: {link}")
|
||||||
|
start_time = time.time()
|
||||||
|
features = extract_features(message)
|
||||||
|
print(f"ai time: {time.time() - start_time:.4f}")
|
||||||
vacancy, created = Vacancy.objects.get_or_create(
|
vacancy, created = Vacancy.objects.get_or_create(
|
||||||
link=link,
|
link=link,
|
||||||
defaults={'content': message, 'features_json': features.model_dump()}
|
|
||||||
)
|
)
|
||||||
|
|
||||||
print(index, link)
|
start_time = time.time()
|
||||||
add_vacancy_vectors(vacancy.id, features.model_dump(), {"link": link})
|
add_vectors(
|
||||||
|
"vacancies",
|
||||||
|
vacancy.id,
|
||||||
|
features.model_dump(),
|
||||||
|
{'content': message, 'features_json': features.model_dump()},
|
||||||
|
)
|
||||||
|
print(f"write vector time: {time.time() - start_time:.4f}")
|
||||||
|
|||||||
@ -1,8 +1,8 @@
|
|||||||
|
import asyncio
|
||||||
from django.core.management import BaseCommand
|
from django.core.management import BaseCommand
|
||||||
from vacancies.main.vector_store import search_similarities
|
from vacancies.main.vector_store import search_similarities
|
||||||
from vacancies.main.models import CustomerCV, RecommendedVacancy
|
from vacancies.main.models import CustomerCV, RecommendedVacancy
|
||||||
from vacancies.main.bot import application
|
from vacancies.main.bot import application
|
||||||
from vacancies.main.features_extractor import extract_vacancy_features
|
|
||||||
from telegram import InlineKeyboardButton, InlineKeyboardMarkup
|
from telegram import InlineKeyboardButton, InlineKeyboardMarkup
|
||||||
from qdrant_client.models import Filter, HasIdCondition
|
from qdrant_client.models import Filter, HasIdCondition
|
||||||
|
|
||||||
@ -14,23 +14,22 @@ class Command(BaseCommand):
|
|||||||
customer_cvs = CustomerCV.objects.all()
|
customer_cvs = CustomerCV.objects.all()
|
||||||
|
|
||||||
for customer_cv in customer_cvs:
|
for customer_cv in customer_cvs:
|
||||||
features = extract_vacancy_features(customer_cv.content)
|
|
||||||
recommended_vacancy_ids = RecommendedVacancy.objects.filter(
|
recommended_vacancy_ids = RecommendedVacancy.objects.filter(
|
||||||
customer=customer_cv.customer,
|
customer=customer_cv.customer,
|
||||||
).values_list('vacancy_id', flat=True)
|
).values_list('vacancy_id', flat=True)
|
||||||
|
|
||||||
query_filter = Filter(must_not=[HasIdCondition(has_id=recommended_vacancy_ids)])
|
query_filter = Filter(must_not=[HasIdCondition(has_id=recommended_vacancy_ids)])
|
||||||
search_result_id = search_similarities(features.model_dump(), query_filter)
|
search_result_id = search_similarities(query_filter, customer_cv.id)
|
||||||
|
|
||||||
recommendation = RecommendedVacancy.objects.create(
|
recommendation = RecommendedVacancy.objects.create(
|
||||||
customer=customer_cv.customer,
|
customer=customer_cv.customer,
|
||||||
vacancy_id=search_result_id,
|
vacancy_id=search_result_id,
|
||||||
)
|
)
|
||||||
|
|
||||||
application.bot.send_message(
|
asyncio.run(application.bot.send_message(
|
||||||
chat_id=recommendation.customer.chat_id,
|
chat_id=recommendation.customer.chat_id,
|
||||||
text=recommendation.vacancy.content,
|
text=recommendation.vacancy.content,
|
||||||
reply_markup=InlineKeyboardMarkup([[
|
reply_markup=InlineKeyboardMarkup([[
|
||||||
InlineKeyboardButton("Откликнуться", url=recommendation.vacancy.link),
|
InlineKeyboardButton("Откликнуться", url=recommendation.vacancy.link),
|
||||||
]]),
|
]]),
|
||||||
)
|
))
|
||||||
|
|||||||
@ -0,0 +1,17 @@
|
|||||||
|
# Generated by Django 5.2.7 on 2025-10-26 16:14
|
||||||
|
|
||||||
|
from django.db import migrations
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
|
||||||
|
|
||||||
|
dependencies = [
|
||||||
|
('main', '0005_remove_vacancy_is_processed'),
|
||||||
|
]
|
||||||
|
|
||||||
|
operations = [
|
||||||
|
migrations.RemoveField(
|
||||||
|
model_name='vacancy',
|
||||||
|
name='features_json',
|
||||||
|
),
|
||||||
|
]
|
||||||
@ -34,7 +34,6 @@ class CustomerCV(models.Model):
|
|||||||
class Vacancy(models.Model):
|
class Vacancy(models.Model):
|
||||||
content = models.TextField()
|
content = models.TextField()
|
||||||
link = models.URLField(unique=True)
|
link = models.URLField(unique=True)
|
||||||
features_json = models.JSONField(null=True, blank=True)
|
|
||||||
created_at = models.DateTimeField(auto_now_add=True)
|
created_at = models.DateTimeField(auto_now_add=True)
|
||||||
|
|
||||||
objects = models.Manager()
|
objects = models.Manager()
|
||||||
|
|||||||
@ -17,10 +17,14 @@ vectors_config = {
|
|||||||
name: models.VectorParams(size=3072, distance=models.Distance.COSINE) for name in FEATURE_NAMES
|
name: models.VectorParams(size=3072, distance=models.Distance.COSINE) for name in FEATURE_NAMES
|
||||||
}
|
}
|
||||||
|
|
||||||
collection_name = "vacancies"
|
if not client.collection_exists("vacancies"):
|
||||||
if not client.collection_exists(collection_name):
|
|
||||||
client.create_collection(
|
client.create_collection(
|
||||||
collection_name=collection_name,
|
collection_name="vacancies",
|
||||||
|
vectors_config=vectors_config
|
||||||
|
)
|
||||||
|
if not client.collection_exists("cvs"):
|
||||||
|
client.create_collection(
|
||||||
|
collection_name="cvs",
|
||||||
vectors_config=vectors_config
|
vectors_config=vectors_config
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -39,7 +43,7 @@ def _prepare_texts(features):
|
|||||||
return texts
|
return texts
|
||||||
|
|
||||||
|
|
||||||
def add_vacancy_vectors(vacancy_id: int, features: dict, payload: dict):
|
def add_vectors(collection_name: str, _id: int, features: dict, payload: dict):
|
||||||
"""Add vectors for a vacancy based on its features."""
|
"""Add vectors for a vacancy based on its features."""
|
||||||
texts = _prepare_texts(features)
|
texts = _prepare_texts(features)
|
||||||
vectors = {}
|
vectors = {}
|
||||||
@ -52,7 +56,7 @@ def add_vacancy_vectors(vacancy_id: int, features: dict, payload: dict):
|
|||||||
collection_name=collection_name,
|
collection_name=collection_name,
|
||||||
points=[
|
points=[
|
||||||
models.PointStruct(
|
models.PointStruct(
|
||||||
id=vacancy_id,
|
id=_id,
|
||||||
vector=vectors,
|
vector=vectors,
|
||||||
payload=payload,
|
payload=payload,
|
||||||
)
|
)
|
||||||
@ -60,20 +64,18 @@ def add_vacancy_vectors(vacancy_id: int, features: dict, payload: dict):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def search_similarities(query_features: dict, query_filter: Filter) -> list[dict]:
|
def search_similarities(query_filter: Filter, cv_id: int) -> list[dict]:
|
||||||
texts = _prepare_texts(query_features)
|
vectors = client.retrieve(
|
||||||
vectors = {}
|
collection_name="cvs",
|
||||||
for name, text in texts.items():
|
ids=[cv_id],
|
||||||
vectors[name] = [0.0] * 3072
|
with_vectors=True,
|
||||||
if text:
|
)[0].vector
|
||||||
vec = embedding.embed_query(text)
|
|
||||||
vectors[name] = vec
|
|
||||||
|
|
||||||
max_similarities = {}
|
max_similarities = {}
|
||||||
for name, vec in vectors.items():
|
for name, vec in vectors.items():
|
||||||
if any(v != 0 for v in vec):
|
if any(v != 0 for v in vec):
|
||||||
results = client.search(
|
results = client.search(
|
||||||
collection_name=collection_name,
|
collection_name="vacancies",
|
||||||
query_vector=(name, vec),
|
query_vector=(name, vec),
|
||||||
limit=1000,
|
limit=1000,
|
||||||
with_payload=True,
|
with_payload=True,
|
||||||
@ -95,7 +97,7 @@ def search_similarities(query_features: dict, query_filter: Filter) -> list[dict
|
|||||||
return scored[0]["id"]
|
return scored[0]["id"]
|
||||||
|
|
||||||
|
|
||||||
def extract_vacancy_features(content: str) -> VacancyFeatures:
|
def extract_features(content: str) -> VacancyFeatures:
|
||||||
prompt = f"""
|
prompt = f"""
|
||||||
Extract the following features from the job vacancy description. If a feature is not mentioned, set it to null.
|
Extract the following features from the job vacancy description. If a feature is not mentioned, set it to null.
|
||||||
Features:
|
Features:
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user