Overhauled contributors file to better handle duplicates

2026-05-07 02:03:55 -05:00 · 2024-08-18 13:04:07 -04:00
parent 00757f3931
commit 2938817158
1 changed files with 391 additions and 219 deletions
--- a/.github/workflows/contributors/update_contributors.py
+++ b/.github/workflows/contributors/update_contributors.py
@@ -10,267 +10,439 @@ from absl import logging

 CONTRIBUTORS_FILE = ".all-contributorsrc"

-EXCLUDED_USERS = {"web-flow", "github-actions[bot]", "mrdragonbear", "jveejay",
-                  "Matthew Steward"}
+EXCLUDED_USERS = {
+    "web-flow",
+    "github-actions[bot]",
+    "mrdragonbear",
+    "jveejay",
+    "Matthew Steward",
+}

 OWNER = "harvard-edge"
 REPO = "cs249r_book"
 BRANCH = "dev"
+RESULTS_PER_PAGE = 1000


 def get_user_data_from_username(username):
-  headers = {"Authorization": f"token {os.environ['GITHUB_TOKEN']}"}
-  res = requests.get(f"https://api.github.com/users/{username}",
-                     headers=headers)
-  user_full_name = pd.NA
-  email_address = pd.NA
-  if res.status_code == 200:
-    user_data = res.json()
-    user_full_name = user_data['name']
-    email_address = user_data['email']
-  else:
-    logging.error(f'Could not find user with username: {username}')
-  return {'username': username, 'user_full_name': user_full_name,
-          'email_address': email_address}
+    headers = {"Authorization": f"token {os.environ['GITHUB_TOKEN']}"}
+    res = requests.get(f"https://api.github.com/users/{username}", headers=headers)
+    user_full_name = pd.NA
+    email_address = pd.NA
+    if res.status_code == 200:
+        user_data = res.json()
+        user_full_name = user_data["name"]
+        email_address = user_data["email"]
+    else:
+        logging.error(f"Could not find user with username: {username}")
+    return {
+        "username": username,
+        "user_full_name": user_full_name,
+        "email_address": email_address,
+    }


 def get_user_data_from_email(email_address):
-  headers = {"Authorization": f"token {os.environ['GITHUB_TOKEN']}"}
-  res = requests.get(f"https://api.github.com/search/users?q={email_address}",
-                     headers=headers)
-  username = pd.NA
-  if res.status_code == 200:
-    user_data = res.json()
-    if user_data['total_count'] > 0:
-      username = user_data['items'][0]['login']
-  else:
-    logging.error(f'Could not find user with email address: {email_address}')
-  return {'username': username, 'user_full_name': pd.NA,
-          'email_address': email_address}
+    headers = {"Authorization": f"token {os.environ['GITHUB_TOKEN']}"}
+    res = requests.get(
+        f"https://api.github.com/search/users?q={email_address}", headers=headers
+    )
+    username = pd.NA
+    if res.status_code == 200:
+        user_data = res.json()
+        if user_data["total_count"] > 0:
+            username = user_data["items"][0]["login"]
+    else:
+        logging.error(f"Could not find user with email address: {email_address}")
+    return {
+        "username": username,
+        "user_full_name": pd.NA,
+        "email_address": email_address,
+    }


 def get_co_authors_from_commit_message(commit_message):
-  co_author_data = []
-  if commit_message:
-    lines = commit_message.splitlines()
-    for line in lines:
-      try:
-        if line.startswith("Co-authored-by:"):
-          co_author = line.split(":")[1].strip()
-          user_full_name, email_address = co_author.split("<")
-          user_full_name = user_full_name.strip()
-          email_address = email_address.strip(">")
-          co_author_data.append(
-              {'user_full_name': user_full_name,
-               'email_address': email_address})
-      except ValueError as e:
-        logging.error(
-            f"Error parsing co-author: {line}. Co-author should be of the form: "
-            f"'Co-authored-by: NAME <email>'. "
-            f"Remember to include the angle brackets around the email."
-        )
-    return pd.DataFrame(co_author_data)
+    co_author_data = []
+    if commit_message:
+        lines = commit_message.splitlines()
+        for line in lines:
+            try:
+                if line.startswith("Co-authored-by:"):
+                    co_author = line.split(":")[1].strip()
+                    user_full_name, email_address = co_author.split("<")
+                    user_full_name = user_full_name.strip()
+                    email_address = email_address.strip(">")
+                    co_author_data.append(
+                        {
+                            "user_full_name": user_full_name,
+                            "email_address": email_address,
+                            "username": pd.NA,
+                        }
+                    )
+            except ValueError as e:
+                logging.error(
+                    f"Error parsing co-author: {line}. Co-author should be of the form: "
+                    f"'Co-authored-by: NAME <email>'. "
+                    f"Remember to include the angle brackets around the email."
+                )
+        return pd.DataFrame(co_author_data)
+
+
+def merge_user_full_names(row, col1, col2):
+    """
+    Merges two columns containing user full names based on the following criteria:
+    - Takes the longest name that is not null and not an email address.
+
+    Parameters:
+    - row: A single row from the DataFrame.
+    - col1: The first column name containing user full names.
+    - col2: The second column name containing user full names.
+
+    Returns:
+    - The merged user full name based on the criteria.
+    """
+
+    def is_email(string):
+        return isinstance(string, str) and "@" in string
+
+    name1 = row[col1]
+    name2 = row[col2]
+
+    if (
+        pd.notna(name1)
+        and not is_email(name1)
+        and (pd.isna(name2) or len(name1) >= len(name2))
+    ):
+        return name1
+    elif pd.notna(name2) and not is_email(name2):
+        return name2
+    else:
+        return pd.NA
+
+
+def merge_email_addresses(row, col1, col2):
+    """
+    Merges two columns containing email addresses based on the following criteria:
+    - Returns the email address that is not null.
+    - If both email addresses are not null, it prioritizes the one that does not contain 'noreply.github.com'.
+
+    Parameters:
+    - row: A single row from the DataFrame.
+    - col1: The first column name containing email addresses.
+    - col2: The second column name containing email addresses.
+
+    Returns:
+    - The selected email address based on the criteria, or pd.NA if both are null.
+    """
+
+    email1 = row[col1]
+    email2 = row[col2]
+
+    # Check if either email is not null
+    if pd.notna(email1) and "noreply.github.com" not in email1:
+        return email1
+    elif pd.notna(email2) and "noreply.github.com" not in email2:
+        return email2
+    elif pd.notna(email1):
+        return email1
+    elif pd.notna(email2):
+        return email2
+    else:
+        return pd.NA


 def main(_):
-  token = os.environ["GITHUB_TOKEN"]
-  headers = {"Authorization": f"token {token}"}
-  data = []
-  next_page = (
-      f"https://api.github.com/repos/{OWNER}/{REPO}/commits?sha={BRANCH}&per_page=500"
-  )
-  last_page = None
-  while next_page != last_page:
-    print(f"Fetching page: {next_page}")
-    res = requests.get(next_page, headers=headers)
-    data.extend(res.json())
-    next_page = res.links.get("next", {}).get("url", None)
-    last_page = res.links.get("last", {}).get("url", None)
+    token = os.environ["GITHUB_TOKEN"]
+    headers = {"Authorization": f"token {token}"}
+    data = []
+    next_page = f"https://api.github.com/repos/{OWNER}/{REPO}/commits?sha={BRANCH}&per_page={RESULTS_PER_PAGE}"
+    last_page = None
+    while next_page != last_page:
+        print(f"Fetching page: {next_page}")
+        res = requests.get(next_page, headers=headers)
+        data.extend(res.json())
+        next_page = res.links.get("next", {}).get("url", None)
+        last_page = res.links.get("last", {}).get("url", None)

-  usernames = set()
-  commit_data = []
-  for node in data:
-    commit_message = node.get("commit", {}).get("message", pd.NA)
-    commit_info = node.get("commit", None)
-    commit_author_info = commit_info.get("author", None)
-    commit_commiter_info = commit_info.get("committer", None)
-    author_info = node.get("author", None)
-    committer_info = node.get("committer", None)
-    committer_login_info = (
-        committer_info.get("login", None) if committer_info else None
-    )
-    user_full_name = pd.NA
-    username = pd.NA
+    # Parse the commit response data
+    commit_data = []
+    for node in data:
+        commit_message = node.get("commit", {}).get("message", pd.NA)
+        commit_info = node.get("commit", None)
+        commit_author_info = commit_info.get("author", None)
+        commit_commiter_info = commit_info.get("committer", None)
+        author_info = node.get("author", None)
+        committer_info = node.get("committer", None)
+        committer_login_info = (
+            committer_info.get("login", None) if committer_info else None
+        )
+        user_full_name = pd.NA
+        user_login = pd.NA
+        user_email_address = pd.NA

-    if commit_author_info:
-      user_full_name = commit_author_info["name"]
-    elif commit_commiter_info:
-      user_full_name = commit_commiter_info["name"]
+        if commit_author_info:
+            user_full_name = commit_author_info["name"]
+            user_email_address = commit_author_info["email"]
+        elif commit_commiter_info:
+            user_full_name = commit_commiter_info["name"]

-    if author_info:
-      username = author_info["login"]
-    elif committer_login_info:
-      username = committer_login_info["login"]
+        if author_info:
+            user_login = author_info["login"]
+        elif committer_login_info:
+            user_login = committer_login_info["login"]

-    try:
-        if username:
-            usernames.add(username)
-    except Exception as e:
-        logging.error(f"Error parsing username: {username}")
+        commit_data.append(
+            {
+                "commit_message": commit_message,
+                "user_full_name": user_full_name,
+                "email_address": user_email_address,
+                "username": user_login,
+            }
+        )
+    commit_data_df = pd.DataFrame(commit_data)

+    # Parse the co-author data from the commit messages
+    co_authors_list = [
+        get_co_authors_from_commit_message(row["commit_message"])
+        for index, row in commit_data_df.iterrows()
+    ]
+    co_authors_df = pd.concat(co_authors_list, ignore_index=True)

+    # All co-authors must have an email address, so look up info and replace
+    # with whatever is on GitHub
+    for index, row in co_authors_df.iterrows():
+        user_data = get_user_data_from_email(row.email_address)
+        co_authors_df.loc[index, "username"] = user_data["username"]

-    commit_data.append(
-        {
-            "commit_message": commit_message,
-            "user_full_name": user_full_name,
-            "username": username,
-        }
-    )
-  commit_data_df = pd.DataFrame(commit_data)
-
-  username_to_fullname = {}
-  for username in usernames:
-      user_data = get_user_data_from_username(username)
-      username_to_fullname[username] = user_data['user_full_name']
-
-  co_authors_list = [get_co_authors_from_commit_message(row["commit_message"])
-                     for index, row in commit_data_df.iterrows()]
-  co_authors_df = pd.concat(co_authors_list, ignore_index=True)
-  co_authors_df.drop_duplicates(inplace=True)
-
-  # Merge the co-authors with the commit data
-  commit_data_df.drop(columns=["commit_message"], inplace=True)
-  commit_data_df = commit_data_df.merge(
-      co_authors_df,
-      how='outer',
-      on=['user_full_name', ])
-
-  # Remove rows where the username or user_full_name is in the EXCLUDED_USERS list in one line
-  commit_data_df = commit_data_df[
-    ~commit_data_df['username'].isin(EXCLUDED_USERS)
-    & ~commit_data_df['user_full_name'].isin(EXCLUDED_USERS)
+    # Remove excluded users
+    co_authors_df = co_authors_df[
+        ~co_authors_df["username"].isin(EXCLUDED_USERS)
+        & ~co_authors_df["user_full_name"].isin(EXCLUDED_USERS)
+    ]
+    commit_data_df = commit_data_df[
+        ~commit_data_df["username"].isin(EXCLUDED_USERS)
+        & ~commit_data_df["user_full_name"].isin(EXCLUDED_USERS)
    ]

-  # Before we drop duplicates, get the number of commits per user
-  commit_data_df = commit_data_df.assign(
-      commits=commit_data_df['user_full_name'].map(
-          commit_data_df['user_full_name'].value_counts()))
-  commit_data_df.drop_duplicates(inplace=True)
+    # Count contributions in each DataFrame
+    co_authors_df["co_author_count"] = co_authors_df.groupby("email_address")[
+        "email_address"
+    ].transform("count")

-  # Use the API to look up all user info
-  for index, row in commit_data_df.iterrows():
-    if not pd.isna(row.username):
-      user_data = get_user_data_from_username(row.username)
-      commit_data_df.loc[index, 'username'] = user_data['username']
+    # Create a combined key using username and email_address to handle
+    # cases with missing usernames. Users can commit without specifying
+    # their username, but they should have an email address.
+    commit_data_df["user_key"] = commit_data_df["username"].combine_first(
+        commit_data_df["email_address"]
+    )

-      if pd.isna(row.user_full_name) or (
-          row.user_full_name == row.username and not pd.isna(
-          user_data['user_full_name'])):
-        commit_data_df.loc[index, 'user_full_name'] = user_data[
-          'user_full_name']
-      if pd.isna(row.email_address):
-        commit_data_df.loc[index, 'email_address'] = user_data['email_address']
-    elif not pd.isna(row.email_address):
-      user_data = get_user_data_from_email(row.email_address)
-      commit_data_df.loc[index, 'email_address'] = user_data['email_address']
+    # Count the number of commits per user (grouped by user_key)
+    commit_data_df["commit_count"] = commit_data_df.groupby("user_key")[
+        "user_key"
+    ].transform("count")

-      if pd.isna(row.username):
-        commit_data_df.loc[index, 'username'] = user_data['username']
-      if pd.isna(row.user_full_name):
-        commit_data_df.loc[index, 'user_full_name'] = user_data[
-          'user_full_name']
-    else:
-      logging.error(f"Could not find user for row: {row}")
-  commit_data_df.drop_duplicates(inplace=True)
+    # Drop the user_key if it's no longer needed
+    commit_data_df.drop(columns=["user_key"], inplace=True)

-  # Get name length to figure out which full name to use
-  commit_data_df = commit_data_df.assign(
-      name_length=commit_data_df['user_full_name'].str.len())
-  commit_data_df = commit_data_df.fillna(pd.NA)
-  commit_data_df = commit_data_df.sort_values(by=['commits', 'name_length'],
-                                              ascending=False)
+    # Since we have the count, remove duplicates
+    commit_data_df = commit_data_df.drop(columns=["commit_message"])
+    co_authors_df.drop_duplicates(inplace=True)
+    commit_data_df.drop_duplicates(inplace=True)

-  # Add a flag column for whether 'username' is NaN
-  commit_data_df['has_username'] = ~commit_data_df['username'].isna()
+    # Now try to find all users with GitHub API
+    for index, row in commit_data_df.iterrows():
+        if not pd.isna(row["username"]):
+            user_data = get_user_data_from_username(row["username"])
+            if not pd.isna(user_data["username"]):
+                commit_data_df.loc[index, "user_full_name"] = user_data[
+                    "user_full_name"
+                ]
+            if not pd.isna(user_data["email_address"]):
+                commit_data_df.loc[index, "email_address"] = user_data["email_address"]
+        elif not pd.isna(row["email_address"]):
+            user_data = get_user_data_from_email(row["email_address"])
+            if not pd.isna(user_data["username"]):
+                commit_data_df.loc[index, "username"] = user_data["username"]
+            if not pd.isna(user_data["user_full_name"]):
+                commit_data_df.loc[index, "user_full_name"] = user_data[
+                    "user_full_name"
+                ]
+        else:
+            logging.error(
+                "Could not find user data for commit: " f"{row['commit_message']}"
+            )

-  # Multi-level group by 'has_username', 'username', and 'email_address'
-  commit_data_df = commit_data_df.groupby(
-      ['has_username', 'username', 'email_address'],
-      dropna=False,
-      as_index=False).first()
+    co_authors_with_username = co_authors_df[~co_authors_df["username"].isna()]
+    co_authors_without_username = co_authors_df[co_authors_df["username"].isna()]

-  # Drop the 'has_username' column as it's no longer needed after grouping
-  commit_data_df.drop('has_username', axis=1, inplace=True)
-  commit_data_df.drop('name_length', axis=1, inplace=True)
+    # First merge: on username
+    merged_df = co_authors_with_username.merge(
+        commit_data_df,
+        how="outer",
+        on=["username"],
+        suffixes=("_co", "_commit"),
+        indicator=True,
+    )

-  # If the user_full_name is an email address, replace it with the username
-  commit_data_df['user_full_name'] = commit_data_df.apply(
-      lambda row: row['username'] if '@' in row['user_full_name'] else row[
-        'user_full_name'],
-      axis=1)
-  
-  commit_data_df['user_full_name'] = commit_data_df.apply(
-  lambda row: username_to_fullname[row['username']] if row['username'] in username_to_fullname else row['user_full_name'], axis=1
-  )
+    # Calculate total contributions after first merge
+    merged_df["total_contributions"] = merged_df["co_author_count"].fillna(
+        0
+    ) + merged_df["commit_count"].fillna(0)

+    # Merge user full name columns
+    merged_df["user_full_name"] = merged_df["user_full_name"] = merged_df.apply(
+        merge_user_full_names,
+        col1="user_full_name_commit",
+        col2="user_full_name_co",
+        axis=1,
+    )

-  def generate_gravatar_url(name):
-    random.seed(name)
-    name_list = list(name)
-    random.shuffle(name_list)
-    name = ''.join(name_list)
-    name_hash = hashlib.md5(name.encode('utf-8')).hexdigest()
-    return f"https://www.gravatar.com/avatar/{name_hash}?d=identicon&s=100"
+    # Merge email address columns
+    merged_df["email_address"] = merged_df.apply(
+        merge_email_addresses,
+        col1="email_address_commit",
+        col2="email_address_co",
+        axis=1,
+    )

-  # Update avatar_url
-  commit_data_df['avatar_url'] = commit_data_df.apply(
-      lambda row: generate_gravatar_url(row['user_full_name']) if pd.isna(row[
-                                                                            'username']) else f"https://avatars.githubusercontent.com/{row['username']}",
-      axis=1)
+    # Drop unnecessary columns
+    merged_df = merged_df.drop(
+        columns=[
+            "_merge",
+            "co_author_count",
+            "commit_count",
+            "user_full_name_co",
+            "user_full_name_commit",
+            "email_address_co",
+            "email_address_commit",
+        ]
+    )
+    merged_df.drop_duplicates(inplace=True)

-  # Update profile URL
-  commit_data_df['profile'] = commit_data_df.apply(
-      lambda
-          row: "https://github.com/harvard-edge/cs249r_book/graphs/contributors" if pd.isna(
-          row['username']) else f"https://github.com/{row['username']}",
-      axis=1)
+    # Second merge: co-authors without username on email
+    merged_df = co_authors_without_username.merge(
+        merged_df,
+        how="outer",
+        on="email_address",
+        suffixes=("_co_no_user", ""),
+        indicator=True,
+    )

-  # Sort by number of commits
-  commit_data_df.sort_values(by='commits', ascending=False, inplace=True)
+    # Update total contributions after second merge
+    merged_df["total_contributions"] = merged_df["total_contributions"].fillna(
+        0
+    ) + merged_df["co_author_count"].fillna(0)

-  final_result = dict(
-      projectName=REPO,
-      projectOwner=OWNER,
-      files=["contents/contributors.qmd", "README.md"],
-      contributors=[
-          dict(
-              login=row.username,
-              name=row.user_full_name if not pd.isna(
-                  row.user_full_name) else row.username,
-              avatar_url=row.avatar_url,
-              profile=row.profile,
-              contributions=[],
-          )
-          for row in commit_data_df.itertuples()
-      ],
-      repoType="github",
-      contributorsPerLine=5,
-      repoHost="https://github.com",
-      commitConvention="angular",
-      skipCi=True,
-  )
+    # Merge user full name columns
+    merged_df["user_full_name"] = merged_df["user_full_name"] = merged_df.apply(
+        merge_user_full_names,
+        col1="user_full_name",
+        col2="user_full_name_co_no_user",
+        axis=1,
+    )

-  # Now, you can use final_result as needed
+    # Remove unnecessary columns
+    merged_df = merged_df.drop(
+        columns=["_merge", "co_author_count", "username_co_no_user"]
+    )

-  json_string = json.dumps(
-      final_result, indent=4
-  )
-  print(json_string)
+    # Merge the user full name columns
+    merged_df["user_full_name"] = merged_df.apply(
+        merge_user_full_names,
+        col1="user_full_name",
+        col2="user_full_name_co_no_user",
+        axis=1,
+    )
+    merged_df = merged_df.drop(columns=["user_full_name_co_no_user"])

-  with open(CONTRIBUTORS_FILE, "w") as contrib_file:
-    contrib_file.write(json_string)
+    # Get name length to figure out which full name to use
+    merged_df = merged_df.assign(name_length=merged_df["user_full_name"].str.len())
+    merged_df = merged_df.fillna(pd.NA)
+    merged_df = merged_df.sort_values(
+        by=["total_contributions", "name_length"], ascending=False
+    )
+
+    # Separate rows with and without usernames
+    df_with_username = merged_df.dropna(subset=["username"])
+    df_without_username = merged_df[merged_df["username"].isna()]
+
+    # Group by username, and take the user_full_name with the most characters for rows with usernames
+    df_with_username = df_with_username.groupby("username", as_index=False).first()
+
+    # Remove rows from df_without_username where the user_full_name matches a user_full_name in df_with_username.
+    # We do this to avoid duplicate entries for the same user. Without a
+    # username, we do not know if two rows are the same user.
+    df_without_username = df_without_username[
+        ~df_without_username["user_full_name"].isin(df_with_username["user_full_name"])
+    ]
+
+    # Combine the grouped rows with usernames and the original rows without usernames
+    merged_df = pd.concat([df_with_username, df_without_username], ignore_index=True)
+
+    def generate_gravatar_url(name):
+        random.seed(name)
+        name_list = list(name)
+        random.shuffle(name_list)
+        name = "".join(name_list)
+        name_hash = hashlib.md5(name.encode("utf-8")).hexdigest()
+        return f"https://www.gravatar.com/avatar/{name_hash}?d=identicon&s=100"
+
+    # Update avatar_url
+    merged_df["avatar_url"] = merged_df.apply(
+        lambda row: (
+            generate_gravatar_url(row["user_full_name"])
+            if pd.isna(row["username"])
+            else f"https://avatars.githubusercontent.com/{row['username']}"
+        ),
+        axis=1,
+    )
+
+    # Update profile URL
+    merged_df["profile"] = merged_df.apply(
+        lambda row: (
+            "https://github.com/harvard-edge/cs249r_book/graphs/contributors"
+            if pd.isna(row["username"])
+            else f"https://github.com/{row['username']}"
+        ),
+        axis=1,
+    )
+
+    # Sort by number of commits
+    merged_df.sort_values(by="total_contributions", ascending=False, inplace=True)
+
+    final_result = dict(
+        projectName=REPO,
+        projectOwner=OWNER,
+        files=["contents/contributors.qmd", "README.md"],
+        contributors=[
+            dict(
+                login=(
+                    row.username if not pd.isna(row.username) else row.user_full_name
+                ),
+                name=(
+                    row.user_full_name
+                    if not pd.isna(row.user_full_name)
+                    else row.username
+                ),
+                avatar_url=row.avatar_url,
+                profile=row.profile,
+                contributions=[],
+            )
+            for row in merged_df.itertuples()
+        ],
+        repoType="github",
+        contributorsPerLine=5,
+        repoHost="https://github.com",
+        commitConvention="angular",
+        skipCi=True,
+    )
+
+    json_string = json.dumps(final_result, indent=4)
+    print(json_string)
+
+    with open(CONTRIBUTORS_FILE, "w") as contrib_file:
+        contrib_file.write(json_string)


 if __name__ == "__main__":
-  app.run(main)
+    app.run(main)