Fix vacancies recommendations weights
This commit is contained in:
parent
cf9f19a216
commit
9dceaaeccc
@ -5,23 +5,7 @@ from vacancies.main.vector_store import add_vectors, extract_features, client as
|
||||
|
||||
clickhouse_client = clickhouse_connect.create_client(host="127.0.0.1", port=18123)
|
||||
|
||||
next_page_offset = 0
|
||||
exist_points_ids = []
|
||||
while next_page_offset is not None:
|
||||
response = qdrant.scroll(
|
||||
collection_name="vacancies",
|
||||
limit=100_000,
|
||||
offset=next_page_offset,
|
||||
with_payload=False,
|
||||
with_vectors=False,
|
||||
timeout=30,
|
||||
)
|
||||
exist_points_ids.extend([point.id for point in response[0]])
|
||||
next_page_offset = response[1]
|
||||
exist_points_set = tuple(exist_points_ids)
|
||||
print("qdrant vacancies points count:", len(exist_points_set))
|
||||
|
||||
query = f"""
|
||||
query = """
|
||||
SELECT id, chat_username, telegram_id, message, timestamp
|
||||
FROM telegram_parser_chatmessage
|
||||
WHERE timestamp >= now() - INTERVAL 30 DAY
|
||||
@ -33,7 +17,7 @@ WHERE timestamp >= now() - INTERVAL 30 DAY
|
||||
'удаленно', 'гибкий график', 'полный день', 'частичная занятость',
|
||||
'резюме', 'собеседование', 'junior', 'middle', 'senior'
|
||||
]) >= 5 AND position(message, 'О себе') = 0 AND position(message, 'Обо мне') = 0 AND position(message, '#ищу') = 0
|
||||
AND id NOT IN {exist_points_set}
|
||||
AND id NOT IN %(exist_points)s
|
||||
"""
|
||||
|
||||
|
||||
@ -41,28 +25,36 @@ class Command(BaseCommand):
|
||||
help = "Collect vacancies from telegram messages"
|
||||
|
||||
def handle(self, *args, **options):
|
||||
import time
|
||||
start_time = time.time()
|
||||
result_rows = clickhouse_client.query(query).result_rows
|
||||
print(f"query time: {time.time() - start_time:.4f}")
|
||||
next_page_offset = 0
|
||||
exist_points_ids = [-1]
|
||||
while next_page_offset is not None:
|
||||
response = qdrant.scroll(
|
||||
collection_name="vacancies",
|
||||
limit=100_000,
|
||||
offset=next_page_offset,
|
||||
with_payload=False,
|
||||
with_vectors=False,
|
||||
timeout=30,
|
||||
)
|
||||
exist_points_ids.extend([point.id for point in response[0]])
|
||||
next_page_offset = response[1]
|
||||
exist_points_set = tuple(set(exist_points_ids))
|
||||
|
||||
result_rows = clickhouse_client.query(query, parameters={"exist_points": exist_points_set}).result_rows
|
||||
result_rows_len = len(result_rows)
|
||||
for index, row in enumerate(result_rows):
|
||||
(id, chat_username, telegram_id, message, timestamp) = row
|
||||
|
||||
link = f"https://t.me/{chat_username}/{telegram_id}"
|
||||
print(f"Processing {index}/{result_rows_len} link: {link}")
|
||||
start_time = time.time()
|
||||
features = extract_features(message)
|
||||
print(f"ai time: {time.time() - start_time:.4f}")
|
||||
vacancy, created = Vacancy.objects.get_or_create(
|
||||
link=link,
|
||||
)
|
||||
|
||||
start_time = time.time()
|
||||
add_vectors(
|
||||
"vacancies",
|
||||
vacancy.id,
|
||||
features.model_dump(),
|
||||
{'content': message, 'features_json': features.model_dump()},
|
||||
)
|
||||
print(f"write vector time: {time.time() - start_time:.4f}")
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
import asyncio
|
||||
|
||||
from django.core.management import BaseCommand
|
||||
from vacancies.main.vector_store import search_similarities, client
|
||||
from vacancies.main.vector_store import search_similarities
|
||||
from vacancies.main.models import CustomerCV, RecommendedVacancy
|
||||
from vacancies.main.bot import application
|
||||
from telegram import InlineKeyboardButton, InlineKeyboardMarkup
|
||||
@ -19,12 +20,7 @@ class Command(BaseCommand):
|
||||
).values_list('vacancy_id', flat=True)
|
||||
|
||||
query_filter = Filter(must_not=[HasIdCondition(has_id=recommended_vacancy_ids)])
|
||||
search_result_id = search_similarities(query_filter, customer_cv.id)
|
||||
|
||||
vacancy_content = client.retrieve(
|
||||
collection_name="vacancies",
|
||||
ids=[search_result_id],
|
||||
)[0].payload["content"]
|
||||
search_result_id, vacancy_content = search_similarities(query_filter, customer_cv.id)
|
||||
|
||||
recommendation = RecommendedVacancy.objects.create(
|
||||
customer=customer_cv.customer,
|
||||
|
||||
@ -62,6 +62,7 @@ class RecommendedVacancy(models.Model):
|
||||
|
||||
|
||||
class VacancyFeatures(BaseModel):
|
||||
job_title: str | None = None # Должность
|
||||
employment_type: str | None = None # Тип занятости
|
||||
work_format: str | None = None # Формат работы
|
||||
experience: str | None = None # Опыт работы
|
||||
|
||||
@ -9,15 +9,16 @@ from vacancies.main.models import VacancyFeatures
|
||||
client = QdrantClient(url="http://localhost:6333")
|
||||
|
||||
FEATURE_NAMES = [
|
||||
"employment_type", "work_format", "experience", "position_level", "industry", "tech_stack",
|
||||
"job_title", "employment_type", "work_format", "experience", "position_level", "industry", "tech_stack",
|
||||
"location", "salary_range", "languages", "education", "schedule", "additional_requirements"
|
||||
]
|
||||
|
||||
weights = {
|
||||
"job_title": 10,
|
||||
"employment_type": 2,
|
||||
"work_format": 2,
|
||||
"experience": 4,
|
||||
"position_level": 4,
|
||||
"experience": 3,
|
||||
"position_level": 5,
|
||||
"industry": 4,
|
||||
"tech_stack": 5,
|
||||
"location": 2,
|
||||
@ -87,35 +88,40 @@ def search_similarities(query_filter: Filter, cv_id: int) -> list[dict]:
|
||||
)[0].vector
|
||||
|
||||
max_similarities = {}
|
||||
vacancies_content = {}
|
||||
for name, vec in vectors.items():
|
||||
if any(v != 0 for v in vec):
|
||||
results = client.search(
|
||||
results = client.query_points(
|
||||
collection_name="vacancies",
|
||||
query_vector=(name, vec),
|
||||
query=vec,
|
||||
using=name,
|
||||
limit=1000,
|
||||
with_payload=True,
|
||||
query_filter=query_filter,
|
||||
)
|
||||
for res in results:
|
||||
for res in results.points:
|
||||
vid = res.id
|
||||
sim = res.score
|
||||
if vid not in max_similarities:
|
||||
max_similarities[vid] = {}
|
||||
max_similarities[vid][name] = sim
|
||||
if vid not in vacancies_content:
|
||||
vacancies_content[vid] = res.payload["content"]
|
||||
|
||||
scored = []
|
||||
for vid, feature_sims in max_similarities.items():
|
||||
total = sum(feature_sims[feature] * weights.get(feature, 1) for feature in feature_sims)
|
||||
scored.append({"id": vid, "score": total})
|
||||
scored.append({"id": vid, "score": total, "content": vacancies_content[vid]})
|
||||
|
||||
scored.sort(key=lambda x: x["score"], reverse=True)
|
||||
return scored[0]["id"]
|
||||
return scored[0]["id"], scored[0]["content"]
|
||||
|
||||
|
||||
def extract_features(content: str) -> VacancyFeatures:
|
||||
prompt = f"""
|
||||
Extract the following features from the job vacancy description. If a feature is not mentioned, set it to null.
|
||||
Features:
|
||||
- job_title: Должность (e.g., DevOps, Python программист)
|
||||
- employment_type: Тип занятости (e.g., Полная занятость, Частичная)
|
||||
- work_format: Формат работы (e.g., Офис, Удалённо, Гибрид)
|
||||
- experience: Опыт работы (e.g., 3-5 лет, Нет опыта)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user