Fix vacancy recommendation weights

This commit is contained in:
estromenko 2025-10-30 00:30:37 +03:00
parent cf9f19a216
commit 9dceaaeccc
4 changed files with 36 additions and 41 deletions

View File

@ -5,23 +5,7 @@ from vacancies.main.vector_store import add_vectors, extract_features, client as
clickhouse_client = clickhouse_connect.create_client(host="127.0.0.1", port=18123)
next_page_offset = 0
exist_points_ids = []
while next_page_offset is not None:
response = qdrant.scroll(
collection_name="vacancies",
limit=100_000,
offset=next_page_offset,
with_payload=False,
with_vectors=False,
timeout=30,
)
exist_points_ids.extend([point.id for point in response[0]])
next_page_offset = response[1]
exist_points_set = tuple(exist_points_ids)
print("qdrant vacancies points count:", len(exist_points_set))
query = f"""
query = """
SELECT id, chat_username, telegram_id, message, timestamp
FROM telegram_parser_chatmessage
WHERE timestamp >= now() - INTERVAL 30 DAY
@ -33,7 +17,7 @@ WHERE timestamp >= now() - INTERVAL 30 DAY
'удаленно', 'гибкий график', 'полный день', 'частичная занятость',
'резюме', 'собеседование', 'junior', 'middle', 'senior'
]) >= 5 AND position(message, 'О себе') = 0 AND position(message, 'Обо мне') = 0 AND position(message, '#ищу') = 0
AND id NOT IN {exist_points_set}
AND id NOT IN %(exist_points)s
"""
@ -41,28 +25,36 @@ class Command(BaseCommand):
help = "Collect vacancies from telegram messages"
def handle(self, *args, **options):
import time
start_time = time.time()
result_rows = clickhouse_client.query(query).result_rows
print(f"query time: {time.time() - start_time:.4f}")
next_page_offset = 0
exist_points_ids = [-1]
while next_page_offset is not None:
response = qdrant.scroll(
collection_name="vacancies",
limit=100_000,
offset=next_page_offset,
with_payload=False,
with_vectors=False,
timeout=30,
)
exist_points_ids.extend([point.id for point in response[0]])
next_page_offset = response[1]
exist_points_set = tuple(set(exist_points_ids))
result_rows = clickhouse_client.query(query, parameters={"exist_points": exist_points_set}).result_rows
result_rows_len = len(result_rows)
for index, row in enumerate(result_rows):
(id, chat_username, telegram_id, message, timestamp) = row
link = f"https://t.me/{chat_username}/{telegram_id}"
print(f"Processing {index}/{result_rows_len} link: {link}")
start_time = time.time()
features = extract_features(message)
print(f"ai time: {time.time() - start_time:.4f}")
vacancy, created = Vacancy.objects.get_or_create(
link=link,
)
start_time = time.time()
add_vectors(
"vacancies",
vacancy.id,
features.model_dump(),
{'content': message, 'features_json': features.model_dump()},
)
print(f"write vector time: {time.time() - start_time:.4f}")

View File

@ -1,6 +1,7 @@
import asyncio
from django.core.management import BaseCommand
from vacancies.main.vector_store import search_similarities, client
from vacancies.main.vector_store import search_similarities
from vacancies.main.models import CustomerCV, RecommendedVacancy
from vacancies.main.bot import application
from telegram import InlineKeyboardButton, InlineKeyboardMarkup
@ -19,12 +20,7 @@ class Command(BaseCommand):
).values_list('vacancy_id', flat=True)
query_filter = Filter(must_not=[HasIdCondition(has_id=recommended_vacancy_ids)])
search_result_id = search_similarities(query_filter, customer_cv.id)
vacancy_content = client.retrieve(
collection_name="vacancies",
ids=[search_result_id],
)[0].payload["content"]
search_result_id, vacancy_content = search_similarities(query_filter, customer_cv.id)
recommendation = RecommendedVacancy.objects.create(
customer=customer_cv.customer,

View File

@ -62,6 +62,7 @@ class RecommendedVacancy(models.Model):
class VacancyFeatures(BaseModel):
job_title: str | None = None # Должность
employment_type: str | None = None # Тип занятости
work_format: str | None = None # Формат работы
experience: str | None = None # Опыт работы

View File

@ -9,15 +9,16 @@ from vacancies.main.models import VacancyFeatures
client = QdrantClient(url="http://localhost:6333")
FEATURE_NAMES = [
"employment_type", "work_format", "experience", "position_level", "industry", "tech_stack",
"job_title", "employment_type", "work_format", "experience", "position_level", "industry", "tech_stack",
"location", "salary_range", "languages", "education", "schedule", "additional_requirements"
]
weights = {
"job_title": 10,
"employment_type": 2,
"work_format": 2,
"experience": 4,
"position_level": 4,
"experience": 3,
"position_level": 5,
"industry": 4,
"tech_stack": 5,
"location": 2,
@ -87,35 +88,40 @@ def search_similarities(query_filter: Filter, cv_id: int) -> list[dict]:
)[0].vector
max_similarities = {}
vacancies_content = {}
for name, vec in vectors.items():
if any(v != 0 for v in vec):
results = client.search(
results = client.query_points(
collection_name="vacancies",
query_vector=(name, vec),
query=vec,
using=name,
limit=1000,
with_payload=True,
query_filter=query_filter,
)
for res in results:
for res in results.points:
vid = res.id
sim = res.score
if vid not in max_similarities:
max_similarities[vid] = {}
max_similarities[vid][name] = sim
if vid not in vacancies_content:
vacancies_content[vid] = res.payload["content"]
scored = []
for vid, feature_sims in max_similarities.items():
total = sum(feature_sims[feature] * weights.get(feature, 1) for feature in feature_sims)
scored.append({"id": vid, "score": total})
scored.append({"id": vid, "score": total, "content": vacancies_content[vid]})
scored.sort(key=lambda x: x["score"], reverse=True)
return scored[0]["id"]
return scored[0]["id"], scored[0]["content"]
def extract_features(content: str) -> VacancyFeatures:
prompt = f"""
Extract the following features from the job vacancy description. If a feature is not mentioned, set it to null.
Features:
- job_title: Должность (e.g., DevOps, Python программист)
- employment_type: Тип занятости (e.g., Полная занятость, Частичная)
- work_format: Формат работы (e.g., Офис, Удалённо, Гибрид)
- experience: Опыт работы (e.g., 3-5 лет, Нет опыта)