You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

234 lines
6.2 KiB
Python

from collections import Counter, defaultdict
from typing import Any, Dict, List, Optional, Tuple
# User records, one per line. Each record's "id" matches its position in
# the list.
USERS = [
    {"id": 0, "name": "Hero"},
    {"id": 1, "name": "Dunn"},
    {"id": 2, "name": "Sue"},
    {"id": 3, "name": "Chi"},
    {"id": 4, "name": "Thor"},
    {"id": 5, "name": "Clive"},
    {"id": 6, "name": "Hicks"},
    {"id": 7, "name": "Devin"},
    {"id": 8, "name": "Kate"},
    {"id": 9, "name": "Klein"},
]
# Friendship edges as pairs of user numbers; every pair is listed once,
# smaller number first.
FRIENDSHIPS = [
    (0, 1),
    (0, 2),
    (1, 2),
    (1, 3),
    (2, 3),
    (3, 4),
    (4, 5),
    (5, 6),
    (5, 7),
    (6, 8),
    (7, 8),
    (8, 9),
]
# (user_id, interest) pairs, grouped by user id. The constant's spelling is
# kept as-is because other code in this module refers to it by this name.
INTERETS = [
    (0, "Hadoop"),
    (0, "Big Data"),
    (0, "HBase"),
    (0, "Java"),
    (0, "Spark"),
    (0, "Storm"),
    (0, "Cassandra"),
    (1, "NoSQL"),
    (1, "MongoDB"),
    (1, "Cassandra"),
    (1, "HBase"),
    (1, "Postgres"),
    (2, "Python"),
    (2, "scikit-learn"),
    (2, "scipy"),
    (2, "numpy"),
    (2, "statsmodels"),
    (2, "pandas"),
    (3, "R"),
    (3, "Python"),
    (3, "statistics"),
    (3, "regression"),
    (3, "probability"),
    (4, "machine learning"),
    (4, "regression"),
    (4, "decision trees"),
    (4, "libsvm"),
    (5, "Python"),
    (5, "R"),
    (5, "Java"),
    (5, "C++"),
    (5, "Haskell"),
    (5, "programming languages"),
    (6, "statistics"),
    (6, "probability"),
    (6, "mathematics"),
    (6, "theory"),
    (7, "machine learning"),
    (7, "scikit-learn"),
    (7, "Mahout"),
    (7, "neural networks"),
    (8, "neural networks"),
    (8, "deep learning"),
    (8, "Big Data"),
    (8, "artificial intelligence"),
    (9, "Hadoop"),
    (9, "Java"),
    (9, "MapReduce"),
    (9, "Big Data"),
]
def add_friendships(
    users: List[Dict], friendships: List[Tuple[int, int]]
) -> List[Dict]:
    """Attach a ``"friends"`` list to every user and fill it from the edges.

    Every user dict is mutated in place: its ``"friends"`` entry is first
    reset to an empty list and then filled with references to the friend
    dicts themselves (not copies and not ids).

    Args:
        users: User records. The numbers in ``friendships`` are used as
            positions in this list.
        friendships: Pairs ``(i, j)``; each pair makes ``users[i]`` and
            ``users[j]`` friends of one another.

    Returns:
        The same ``users`` list object that was passed in.
    """
    for person in users:
        person["friends"] = []
    for left, right in friendships:
        first, second = users[left], users[right]
        first["friends"].append(second)  # second becomes a friend of first
        second["friends"].append(first)  # first becomes a friend of second
    return users
def number_of_friends(user: Dict) -> int:
    """Return the length of the user's ``"friends"`` list."""
    friends = user["friends"]
    return len(friends)
def friend_counts(users: List[Dict]) -> List[Tuple[int, int]]:
    """Return ``(user_id, friend_count)`` pairs, most-connected users first.

    The sort is stable, so users with the same friend count keep the
    relative order they have in ``users``.
    """
    pairs = [(user["id"], number_of_friends(user)) for user in users]
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs
def friend_of_friend_ids_bad(user: Dict[str, Any]) -> List[int]:
    """Return the id of every friend of every friend, with no filtering.

    Nothing is de-duplicated or excluded: an id reachable through several
    friends appears several times, and the result can contain the user's
    own id as well as ids of people who are already direct friends.
    """
    ids: List[int] = []
    for friend in user["friends"]:
        ids.extend(foaf["id"] for foaf in friend["friends"])
    return ids
def find_friend_of_friends(user: Dict) -> List[int]:
    """Return the distinct ids of the user's friends-of-friends, ascending.

    The user's own id and the ids of the user's direct friends are excluded.

    Args:
        user: A user record whose ``"friends"`` entry is a list of user
            records, each with its own ``"friends"`` list.

    Returns:
        A sorted list with one entry per friend-of-a-friend id.
    """
    direct_friends = user.get("friends")
    candidate_ids = {
        foaf.get("id")
        for friend in direct_friends
        for foaf in friend.get("friends")
    }
    excluded_ids = {friend.get("id") for friend in direct_friends}
    excluded_ids.add(user.get("id"))
    # Sort so the result does not depend on set iteration order.
    return sorted(candidate_ids - excluded_ids)
def not_the_same(user: Dict, other_user: Dict) -> bool:
    """Return True when the two records carry different ``"id"`` values.

    Only the ``"id"`` entries are compared; a record without one is treated
    as having id ``None``. With classes this would be an ``__eq__``/``__ne__``
    override.
    """
    user_id = user.get("id")
    other_id = other_user.get("id")
    return user_id != other_id
def not_friends(user: Dict, other_user: Dict) -> bool:
    """Return True when ``other_user`` is not in ``user``'s friends list.

    Friends are matched by id via ``not_the_same``; an empty friends list
    yields True.
    """
    for friend in user.get("friends"):
        if not not_the_same(friend, other_user):
            # Found a friend with the same id, so they are friends.
            return False
    return True
def friends_of_friends_ids(user: Dict[str, Any]) -> Counter:
    """Count, per friend-of-a-friend, how many of the user's friends link to them.

    The user and the user's direct friends are never counted.

    Args:
        user: A user record whose ``"friends"`` entry is a list of user
            records, each with its own ``"friends"`` list.

    Returns:
        A ``Counter`` mapping each friend-of-a-friend id to the number of
        the user's friends that have that person in their friends list.
    """
    direct_friends = user.get("friends")
    # Build the exclusion set once instead of rescanning the friends list
    # for every candidate.
    excluded_ids = {friend.get("id") for friend in direct_friends}
    excluded_ids.add(user.get("id"))
    return Counter(
        foaf.get("id")
        for friend in direct_friends
        for foaf in friend.get("friends")
        if foaf.get("id") not in excluded_ids
    )
def data_scientist_who_like(
    target_interst: str, interests: Optional[List[Tuple[int, str]]] = None
) -> List[int]:
    """Return the ids of the users who list exactly ``target_interst``.

    The match is an exact, case-sensitive string comparison. Every call
    scans the whole pair list, so repeated lookups are better served by a
    prebuilt interest-to-users index.

    Args:
        target_interst: The interest string to look for.
        interests: ``(user_id, interest)`` pairs to search. When omitted
            (``None``), the module-level ``INTERETS`` table is used.

    Returns:
        Matching user ids in the order they appear in the pair list.
    """
    if interests is None:
        interests = INTERETS
    return [
        user_id for user_id, interest in interests if interest == target_interst
    ]
def build_interest_to_user_index(
    interests: List[Tuple[int, str]]
) -> Dict[str, List[int]]:
    """Group user ids by interest.

    Args:
        interests: ``(user_id, interest)`` pairs.

    Returns:
        A ``defaultdict(list)`` mapping each interest string to the ids of
        the users that list it, in input order.
    """
    index = defaultdict(list)
    for user_id, interest in interests:
        bucket = index[interest]
        bucket.append(user_id)
    return index
def build_user_to_interest_index(
    interests: List[Tuple[int, str]]
) -> Dict[int, List[str]]:
    """Group interests by user id.

    Args:
        interests: ``(user_id, interest)`` pairs.

    Returns:
        A ``defaultdict(list)`` mapping each user id to that user's
        interest strings, in input order.
    """
    index = defaultdict(list)
    for user_id, interest in interests:
        bucket = index[user_id]
        bucket.append(interest)
    return index
def most_common_interests_with(
    user: Dict[str, Any],
    user_index: Dict[int, List[str]],
    interest_index: Dict[str, List[int]],
) -> Counter:
    """Count how many interests each other user shares with ``user``.

    Args:
        user: The user record to compare against; only its ``"id"`` is read.
        user_index: Mapping of user id to that user's interest strings.
        interest_index: Mapping of interest string to the ids of the users
            that list it.

    Returns:
        A ``Counter`` mapping every other user's id to the number of
        interests they have in common with ``user``. It is empty when the
        user has no entry in ``user_index``.
    """
    user_id = user.get("id")
    return Counter(
        other_id
        # Empty defaults: a user without interests, or an interest missing
        # from the index, contributes nothing instead of raising TypeError.
        for interest in user_index.get(user_id, ())
        for other_id in interest_index.get(interest, ())
        if other_id != user_id
    )
# Count how often each word occurs across all users' interest strings.
def build_interest_counter(interests: List[Tuple[int, str]]) -> Counter:
    """Count word occurrences over all interest strings.

    Each interest is lower-cased and split on whitespace, so a multi-word
    interest contributes one count per word. The user ids are ignored.
    """
    word_counts = Counter()
    for _user_id, interest in interests:
        word_counts.update(interest.lower().split())
    return word_counts
def print_counter(counter: Counter) -> None:
    """Print each entry of ``counter`` as ``key: count``, one per line.

    Entries are printed in the counter's own iteration order.
    """
    for word in counter:
        count = counter[word]
        print(f"{word}: {count}")
if __name__ == "__main__":
    separator = "-" * 100

    # Friendship graph: wire up the friends lists, then derive the stats.
    users = add_friendships(USERS, FRIENDSHIPS)
    num_users = len(users)
    # Sum of per-user friend counts.
    total_connections = sum(number_of_friends(user) for user in users)
    avg_connections = total_connections / num_users
    ranked_by_friend_count = friend_counts(users)

    hero, chi = users[0], users[3]
    naive_suggestions_for_hero = friend_of_friend_ids_bad(hero)
    filtered_suggestions_for_hero = find_friend_of_friends(hero)
    chis_friend_suggestions_with_counts = friends_of_friends_ids(chi)

    print(f"Total user count: {num_users}")
    print(f"Total number of connections: {total_connections}")
    print(f"Average connections: {avg_connections}")
    print(ranked_by_friend_count)
    print(naive_suggestions_for_hero)
    print(filtered_suggestions_for_hero)
    print(f"Chi's friend suggestions: {chis_friend_suggestions_with_counts}")
    print(separator)

    # Interest overlap: who shares the most interests with Chi.
    interests_by_user = build_user_to_interest_index(INTERETS)
    users_by_interest = build_interest_to_user_index(INTERETS)
    chis_common_interests = most_common_interests_with(
        chi, interests_by_user, users_by_interest
    )
    print("Chi's most users with greatest interest overlap:")
    print(chis_common_interests)
    print(separator)

    # Word frequencies across every interest string.
    print_counter(build_interest_counter(INTERETS))