You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
234 lines
6.2 KiB
Python
234 lines
6.2 KiB
Python
6 years ago
|
from collections import Counter, defaultdict
|
||
|
from typing import List, Dict, Tuple, Any
|
||
|
|
||
|
# Sample user records: each one has an integer "id" and a "name".
# add_friendships() later attaches a "friends" list to each of these dicts.
USERS = [
    {"id": 0, "name": "Hero"},
    {"id": 1, "name": "Dunn"},
    {"id": 2, "name": "Sue"},
    {"id": 3, "name": "Chi"},
    {"id": 4, "name": "Thor"},
    {"id": 5, "name": "Clive"},
    {"id": 6, "name": "Hicks"},
    {"id": 7, "name": "Devin"},
    {"id": 8, "name": "Kate"},
    {"id": 9, "name": "Klein"},
]
|
||
|
|
||
|
# Pairs of user ids; add_friendships() records each pair in both directions.
FRIENDSHIPS = [
    (0, 1),
    (0, 2),
    (1, 2),
    (1, 3),
    (2, 3),
    (3, 4),
    (4, 5),
    (5, 6),
    (5, 7),
    (6, 8),
    (7, 8),
    (8, 9),
]
|
||
|
|
||
|
# (user_id, interest) pairs, grouped here by user id.
# NOTE: the constant's name is misspelled ("INTERETS"); it is kept as-is
# because other code in this module refers to it by this name.
INTERETS = [
    # user 0
    (0, "Hadoop"),
    (0, "Big Data"),
    (0, "HBase"),
    (0, "Java"),
    (0, "Spark"),
    (0, "Storm"),
    (0, "Cassandra"),
    # user 1
    (1, "NoSQL"),
    (1, "MongoDB"),
    (1, "Cassandra"),
    (1, "HBase"),
    (1, "Postgres"),
    # user 2
    (2, "Python"),
    (2, "scikit-learn"),
    (2, "scipy"),
    (2, "numpy"),
    (2, "statsmodels"),
    (2, "pandas"),
    # user 3
    (3, "R"),
    (3, "Python"),
    (3, "statistics"),
    (3, "regression"),
    (3, "probability"),
    # user 4
    (4, "machine learning"),
    (4, "regression"),
    (4, "decision trees"),
    (4, "libsvm"),
    # user 5
    (5, "Python"),
    (5, "R"),
    (5, "Java"),
    (5, "C++"),
    (5, "Haskell"),
    (5, "programming languages"),
    # user 6
    (6, "statistics"),
    (6, "probability"),
    (6, "mathematics"),
    (6, "theory"),
    # user 7
    (7, "machine learning"),
    (7, "scikit-learn"),
    (7, "Mahout"),
    (7, "neural networks"),
    # user 8
    (8, "neural networks"),
    (8, "deep learning"),
    (8, "Big Data"),
    (8, "artificial intelligence"),
    # user 9
    (9, "Hadoop"),
    (9, "Java"),
    (9, "MapReduce"),
    (9, "Big Data"),
]
|
||
|
|
||
|
|
||
|
def add_friendships(
    users: List[Dict], friendships: List[Tuple[int, int]]
) -> List[Dict]:
    """Attach a mutual ``"friends"`` list to every user, in place.

    Every user dict first gets a fresh, empty ``"friends"`` list (replacing
    any existing one). Each ``(i, j)`` pair is then recorded in both
    directions. Friends are stored as references to the user dicts
    themselves, so the resulting structure is cyclic.

    Users are matched by their ``"id"`` value rather than by their position
    in ``users``, so the list does not need to be ordered by id.

    Args:
        users: User dicts, each with an ``"id"`` key.
        friendships: Pairs of user ids that are friends with each other.

    Returns:
        The same ``users`` list that was passed in, now mutated.

    Raises:
        KeyError: If a user has no ``"id"`` or a pair names an unknown id.
    """
    users_by_id = {}
    for user in users:
        user["friends"] = []
        users_by_id[user["id"]] = user

    for i, j in friendships:
        first, second = users_by_id[i], users_by_id[j]
        first["friends"].append(second)  # add j as a friend of i
        second["friends"].append(first)  # add i as a friend of j

    return users
|
||
|
|
||
|
|
||
|
def number_of_friends(user: Dict) -> int:
    """Return how many entries are in the user's ``"friends"`` list."""
    friends = user["friends"]
    return len(friends)
|
||
|
|
||
|
|
||
|
def friend_counts(users: List[Dict]) -> List[Tuple[int, int]]:
    """Rank users by number of friends, most-connected first.

    Returns:
        ``(user_id, friend_count)`` tuples in descending order of count.
        Users with equal counts keep their relative order from ``users``.
    """
    ranking = []
    for user in users:
        ranking.append((user["id"], number_of_friends(user)))

    # The sort is stable, so ties stay in input order even with reverse=True.
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    return ranking
|
||
|
|
||
|
|
||
|
def friend_of_friend_ids_bad(user: Dict[str, Any]) -> List[int]:
    """Collect the id of every friend's friend, with no filtering.

    The result keeps duplicates and can contain the user's own id as well as
    ids of people who are already direct friends.
    """
    ids = []
    for friend in user["friends"]:
        ids.extend(foaf["id"] for foaf in friend["friends"])
    return ids
|
||
|
|
||
|
|
||
|
def find_friend_of_friends(user: Dict) -> List[int]:
    """Return the unique ids of the user's friends-of-friends, ascending.

    The user's own id and the ids of direct friends are excluded. A user (or
    friend) without a ``"friends"`` entry is treated as having no friends.

    Args:
        user: A user dict with ``"id"`` and, normally, a ``"friends"`` list
            of user dicts.

    Returns:
        A sorted list of ids, so the result is deterministic rather than in
        arbitrary set-iteration order.
    """
    friends = user.get("friends", [])

    # Everyone who must not be suggested: direct friends and the user.
    excluded_ids = {friend.get("id") for friend in friends}
    excluded_ids.add(user.get("id"))

    candidate_ids = {
        foaf.get("id")
        for friend in friends
        for foaf in friend.get("friends", [])
    }

    return sorted(candidate_ids - excluded_ids)
|
||
|
|
||
|
|
||
|
def not_the_same(user: Dict, other_user: Dict) -> bool:
    """Return True when the two user dicts carry different ``"id"`` values.

    This is an inequality test: it is False for two records with the same id.
    In an object-oriented design this would be expressed through ``__eq__``.
    """
    user_id = user.get("id")
    other_id = other_user.get("id")
    return user_id != other_id
|
||
|
|
||
|
|
||
|
def not_friends(user: Dict, other_user: Dict) -> bool:
    """Return True when ``other_user`` is not among ``user``'s friends.

    Friends are compared with ``other_user`` by id via ``not_the_same``.
    """
    for friend in user.get("friends"):
        if not not_the_same(friend, other_user):
            # Found a friend with the same id, so they are already friends.
            return False
    return True
|
||
|
|
||
|
|
||
|
def friends_of_friends_ids(user: Dict[str, Any]) -> Counter:
    """Count, per friend-of-a-friend, how many of the user's friends know them.

    The user and the user's direct friends are never counted. A user (or
    friend) without a ``"friends"`` entry is treated as having no friends.

    Args:
        user: A user dict with ``"id"`` and, normally, a ``"friends"`` list
            of user dicts.

    Returns:
        A ``Counter`` mapping each suggested id to the number of times it
        was reached through one of the user's friends.
    """
    friends = user.get("friends", [])

    # Build the exclusion set once instead of rescanning the friend list for
    # every candidate.
    excluded_ids = {friend.get("id") for friend in friends}
    excluded_ids.add(user.get("id"))

    counts = Counter()
    for friend in friends:
        for foaf in friend.get("friends", []):
            foaf_id = foaf.get("id")
            if foaf_id not in excluded_ids:
                counts[foaf_id] += 1
    return counts
|
||
|
|
||
|
|
||
|
def data_scientist_who_like(target_interst: str, interests=None) -> List[int]:
    """Return the ids of users who list ``target_interst`` as an interest.

    This is not an efficient lookup: every call scans the entire list of
    interests.

    Args:
        target_interst: Interest to look for, matched by exact
            (case-sensitive) equality. The parameter's spelling is kept so
            existing keyword callers keep working.
        interests: ``(user_id, interest)`` pairs to search. Defaults to the
            module-level ``INTERETS`` table when omitted or ``None``.

    Returns:
        The matching user ids in the order they appear in ``interests``.
    """
    if interests is None:
        interests = INTERETS
    return [user_id for user_id, interest in interests if interest == target_interst]
|
||
|
|
||
|
|
||
|
def build_interest_to_user_index(
    interests: List[Tuple[int, "str"]]
) -> Dict[str, List[int]]:
    """Group user ids under each interest.

    Args:
        interests: ``(user_id, interest)`` pairs.

    Returns:
        A ``defaultdict(list)`` keyed by interest, with user ids listed in
        the order they were encountered. Because it is a ``defaultdict``,
        subscripting a missing interest creates and returns an empty list.
    """
    index = defaultdict(list)
    for uid, topic in interests:
        index[topic].append(uid)
    return index
|
||
|
|
||
|
|
||
|
def build_user_to_interest_index(
    interests: List[Tuple[int, "str"]]
) -> Dict[int, List[str]]:
    """Group interests under each user id.

    Args:
        interests: ``(user_id, interest)`` pairs.

    Returns:
        A ``defaultdict(list)`` keyed by user id, with interests listed in
        the order they were encountered. Because it is a ``defaultdict``,
        subscripting a missing user id creates and returns an empty list.
    """
    index = defaultdict(list)
    for uid, topic in interests:
        index[uid].append(topic)
    return index
|
||
|
|
||
|
|
||
|
def most_common_interests_with(
    user: Dict[str, Any],
    user_index: Dict[int, List[str]],
    interest_index: Dict[str, List[int]],
) -> Counter:
    """Count how many interests every other user shares with ``user``.

    Args:
        user: A user dict; only its ``"id"`` is read.
        user_index: Mapping of user id to that user's interests.
        interest_index: Mapping of interest to the ids of users who have it.

    Returns:
        A ``Counter`` mapping each other user's id to the number of
        interests shared with ``user``. It is empty when the user has no
        entry in ``user_index``; interests missing from ``interest_index``
        contribute nothing.
    """
    user_id = user.get("id")
    shared = Counter()
    # Default to empty iterables: .get() would otherwise return None for an
    # unknown user or interest and the loops would raise TypeError.
    for interest in user_index.get(user_id, ()):
        for other_id in interest_index.get(interest, ()):
            if other_id != user_id:
                shared[other_id] += 1
    return shared
|
||
|
|
||
|
|
||
|
# Write a function that will produce a count of users expressing an interest
|
||
|
|
||
|
|
||
|
def build_interest_counter(interests: List[Tuple[int, str]]) -> Counter:
    """Tally the individual words used across all interests.

    Each interest string is lower-cased and split on whitespace; the user id
    in each pair is ignored.
    """
    word_counts = Counter()
    for _, interest in interests:
        word_counts.update(interest.lower().split())
    return word_counts
|
||
|
|
||
|
|
||
|
def print_counter(counter: Counter):
    """Print one ``word: count`` line per entry, in the counter's own order."""
    for item in counter.items():
        word, count = item
        print(f"{word}: {count}")
|
||
|
|
||
|
|
||
|
if __name__ == "__main__":
    # --- Friendship statistics -------------------------------------------
    # add_friendships mutates the USERS dicts in place and returns the list.
    users = add_friendships(USERS, FRIENDSHIPS)
    num_users = len(users)
    # Each friendship is stored on both users, so it is counted twice here.
    total_connections = sum(map(number_of_friends, users))
    avg_connections = total_connections / num_users
    print(f"Total user count: {num_users}")
    print(f"Total number of connections: {total_connections}")
    print(f"Average connections: {avg_connections}")

    counts = friend_counts(users)
    print(counts)

    # --- Friend suggestions ----------------------------------------------
    # users[0] is Hero: first the unfiltered ids, then the de-duplicated set.
    heros_suggested_friends = friend_of_friend_ids_bad(users[0])
    print(heros_suggested_friends)

    better_friend_suggestion = find_friend_of_friends(users[0])
    print(better_friend_suggestion)

    # users[3] is Chi.
    chis_friend_suggestions_with_counts = friends_of_friends_ids(users[3])
    print(f"Chi's friend suggestions: {chis_friend_suggestions_with_counts}")

    # --- Shared interests ------------------------------------------------
    print("-" * 100)
    user_idx = build_user_to_interest_index(INTERETS)
    interest_idx = build_interest_to_user_index(INTERETS)

    chis_common_interests = most_common_interests_with(users[3], user_idx, interest_idx)
    print("Chi's most users with greatest interest overlap:")
    print(chis_common_interests)

    # --- Word frequencies across all interests ---------------------------
    print("-" * 100)
    interest_counter = build_interest_counter(INTERETS)
    print_counter(interest_counter)
|