Skip to content

Commit

Permalink
read vouches in TournesolInput
Browse files Browse the repository at this point in the history
  • Loading branch information
amatissart committed May 16, 2024
1 parent 049d72e commit dde4c9f
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 14 deletions.
12 changes: 12 additions & 0 deletions backend/ml/inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
ContributorScaling,
Entity,
)
from vouch.models import Voucher


class MlInputFromDb(TournesolInput):
Expand Down Expand Up @@ -189,3 +190,14 @@ def get_individual_scores(

dtf = pd.DataFrame(values)
return dtf[["user_id", "entity", "criteria", "raw_score"]]

def get_vouches(self):
    """Fetch the vouches exchanged between active users.

    Only vouches whose giver (`by`) and receiver (`to`) are both active
    users are returned.

    Returns:
        DataFrame with columns
        * `voucher`: id of the user who gives the vouch
        * `vouchee`: id of the user who receives the vouch
        * `vouch`: value of this vouch
    """
    # Imported locally so this method does not depend on the module's
    # import block already exposing `F`.
    from django.db.models import F

    columns = ["voucher", "vouchee", "vouch"]
    rows = Voucher.objects.filter(
        by__is_active=True,
        to__is_active=True,
    ).values(
        # Keyword arguments of values() are forwarded to annotate(), which
        # only accepts query expressions: a bare field-name string is
        # rejected, so each renamed field must be wrapped in F().
        voucher=F("by__id"),
        vouchee=F("to__id"),
        vouch=F("value"),
    )
    # Passing the columns explicitly keeps the expected schema even when
    # no vouch matches the filter (empty result).
    return pd.DataFrame(list(rows), columns=columns)
10 changes: 6 additions & 4 deletions backend/tournesol/lib/public_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ def write_comparisons_file(
"criteria",
"score",
"score_max",
"week_date"
"week_date",
]
writer = csv.DictWriter(write_target, fieldnames=fieldnames)
writer.writeheader()
Expand Down Expand Up @@ -413,7 +413,9 @@ def write_vouchers_file(write_target):
"to_username": voucher.to.username,
"value": voucher.value,
}
for voucher in Voucher.objects.filter(is_public=True)
.select_related("by", "to")
.order_by("by__username", "to__username")
for voucher in (
Voucher.objects.filter(is_public=True, by__is_active=True, to__is_active=True)
.select_related("by", "to")
.order_by("by__username", "to__username")
)
)
44 changes: 34 additions & 10 deletions solidago/src/solidago/pipeline/inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,14 +57,26 @@ def get_individual_scores(
) -> Optional[pd.DataFrame]:
raise NotImplementedError

@abstractmethod
def get_vouches(self):
# TODO: make abstract and implement in subclasses
return pd.DataFrame(columns=["voucher", "vouchee", "vouch"])
"""Fetch data about vouches shared between users
Returns:
- DataFrame with columns
* `voucher`: int, user_id of the user who gives the vouch
* `vouchee`: int, user_id of the user who receives the vouch
* `vouch`: float, value of this vouch
"""
raise NotImplementedError

def get_users(self):
    """Build the users table from `self.ratings_properties`.

    Returns:
        DataFrame indexed by `user_id` with columns
        * `trust_score`: first value found for this user
        * `is_pretrusted`: True when `trust_score` is at least 0.8
    """
    first_per_user = self.ratings_properties.groupby("user_id").first()
    return first_per_user[["trust_score"]].assign(
        is_pretrusted=lambda frame: frame["trust_score"] >= 0.8
    )

def get_pipeline_kwargs(self, criterion: str):
ratings_properties = self.ratings_properties
users = ratings_properties.groupby("user_id").first()[["trust_score"]]
users["is_pretrusted"] = users["trust_score"] >= 0.8
users = self.get_users()
vouches = self.get_vouches()
comparisons = self.get_comparisons(criteria=criterion)
entities_ids = set(comparisons["entity_a"].unique()) | set(
Expand Down Expand Up @@ -134,26 +146,25 @@ def __init__(self, dataset_zip: Union[str, BinaryIO]):
# Fill trust_score on newly created users for which it was not computed yet
self.users.trust_score = pd.to_numeric(self.users.trust_score).fillna(0.0)

username_to_user_id = pd.Series(
self.username_to_user_id = pd.Series(
data=self.users.index, index=self.users["public_username"]
)
self.comparisons = self.comparisons.join(username_to_user_id, on="public_username")
self.comparisons = self.comparisons.join(self.username_to_user_id, on="public_username")

with (zipfile.Path(zip_file) / "vouchers.csv").open(mode="rb") as vouchers_file:
# keep_default_na=False is required otherwise some public usernames
# such as "NA" are converted to float NaN.
self.vouchers = pd.read_csv(vouchers_file, keep_default_na=False)

with (zipfile.Path(zip_file) / "collective_criteria_scores.csv").open(mode="rb") as collective_scores_file:
# keep_default_na=False is required otherwise some public usernames
# such as "NA" are converted to float NaN.
self.collective_scores = pd.read_csv(collective_scores_file, keep_default_na=False)

with (zipfile.Path(zip_file) / "individual_criteria_scores.csv").open(mode="rb") as individual_scores_file:
# keep_default_na=False is required otherwise some public usernames
# such as "NA" are converted to float NaN.
self.individual_scores = pd.read_csv(individual_scores_file, keep_default_na=False)


@classmethod
def download(cls) -> "TournesolInputFromPublicDataset":
Expand Down Expand Up @@ -197,3 +208,16 @@ def get_individual_scores(
) -> Optional[pd.DataFrame]:
# TODO: read contributor scores from individual_scores.csv
return None

def get_vouches(self):
    """Build the vouches table from the dataset's `vouchers` frame.

    Rows whose `by_username` or `to_username` is absent from
    `self.username_to_user_id` are dropped; the remaining usernames are
    translated to user ids through that mapping.

    Returns:
        DataFrame with columns `voucher`, `vouchee` and `vouch`.
    """
    user_id_of = self.username_to_user_id
    known_usernames = user_id_of.index

    giver_known = self.vouchers["by_username"].isin(known_usernames)
    receiver_known = self.vouchers["to_username"].isin(known_usernames)
    kept = self.vouchers[giver_known & receiver_known]

    voucher_ids = kept["by_username"].map(user_id_of)
    vouchee_ids = kept["to_username"].map(user_id_of)
    return pd.DataFrame(
        {
            "voucher": voucher_ids,
            "vouchee": vouchee_ids,
            "vouch": kept["value"],
        }
    )

0 comments on commit dde4c9f

Please sign in to comment.