data_science_from_scratch/ch1/friends2.py

from collections import Counter, defaultdict
from typing import List, Dict, Tuple, Any

# Sample users of the social network. Each record has an integer "id" and a
# display "name"; in this data every id equals the record's position in the list.
USERS = [
    {"id": 0, "name": "Hero"},
    {"id": 1, "name": "Dunn"},
    {"id": 2, "name": "Sue"},
    {"id": 3, "name": "Chi"},
    {"id": 4, "name": "Thor"},
    {"id": 5, "name": "Clive"},
    {"id": 6, "name": "Hicks"},
    {"id": 7, "name": "Devin"},
    {"id": 8, "name": "Kate"},
    {"id": 9, "name": "Klein"},
]

# Friendship pairs. add_friendships() uses both numbers of a pair as indexes
# into the user list and links the two users in both directions, so each
# friendship is listed only once here.
FRIENDSHIPS = [
    (0, 1),
    (0, 2),
    (1, 2),
    (1, 3),
    (2, 3),
    (3, 4),
    (4, 5),
    (5, 6),
    (5, 7),
    (6, 8),
    (7, 8),
    (8, 9),
]

# (user_id, interest) pairs, one entry per interest a user has declared.
# NOTE: the name is a misspelling of "INTERESTS"; it is left as-is because
# other code in this module refers to it by this exact name.
INTERETS = [
    (0, "Hadoop"),
    (0, "Big Data"),
    (0, "HBase"),
    (0, "Java"),
    (0, "Spark"),
    (0, "Storm"),
    (0, "Cassandra"),
    (1, "NoSQL"),
    (1, "MongoDB"),
    (1, "Cassandra"),
    (1, "HBase"),
    (1, "Postgres"),
    (2, "Python"),
    (2, "scikit-learn"),
    (2, "scipy"),
    (2, "numpy"),
    (2, "statsmodels"),
    (2, "pandas"),
    (3, "R"),
    (3, "Python"),
    (3, "statistics"),
    (3, "regression"),
    (3, "probability"),
    (4, "machine learning"),
    (4, "regression"),
    (4, "decision trees"),
    (4, "libsvm"),
    (5, "Python"),
    (5, "R"),
    (5, "Java"),
    (5, "C++"),
    (5, "Haskell"),
    (5, "programming languages"),
    (6, "statistics"),
    (6, "probability"),
    (6, "mathematics"),
    (6, "theory"),
    (7, "machine learning"),
    (7, "scikit-learn"),
    (7, "Mahout"),
    (7, "neural networks"),
    (8, "neural networks"),
    (8, "deep learning"),
    (8, "Big Data"),
    (8, "artificial intelligence"),
    (9, "Hadoop"),
    (9, "Java"),
    (9, "MapReduce"),
    (9, "Big Data"),
]


def add_friendships(
    users: List[Dict], friendships: List[Tuple[int, int]]
) -> List[Dict]:
    """Attach a mutual "friends" list to every user dict, in place.

    Every user first receives a fresh, empty "friends" list (any existing one
    is replaced). Then each ``(i, j)`` pair links the users found at positions
    ``i`` and ``j`` of ``users`` to each other.

    Returns the same ``users`` list object that was passed in.
    """
    for person in users:
        person["friends"] = []

    for i, j in friendships:
        first, second = users[i], users[j]
        first["friends"].append(second)  # j becomes a friend of i
        second["friends"].append(first)  # i becomes a friend of j

    return users


def number_of_friends(user: Dict) -> int:
    """Return how many entries the user's "friends" list holds."""
    friend_list = user["friends"]
    return len(friend_list)


def friend_counts(users: List[Dict]) -> List[Tuple[int, int]]:
    """Return ``(user id, friend count)`` pairs, most-connected users first.

    The sort is stable, so users with equal counts keep their input order.
    """
    pairs = []
    for person in users:
        pairs.append((person["id"], number_of_friends(person)))

    # Order by the count (second element of each pair), largest first.
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs


def friend_of_friend_ids_bad(user: Dict[str, Any]) -> List[int]:
    """Collect the id of every friend of every friend, with no filtering.

    The result keeps duplicates and may include the user's own id and the ids
    of direct friends, which is why this is the "bad" variant.
    """
    collected = []
    for friend in user["friends"]:
        collected.extend(foaf["id"] for foaf in friend["friends"])
    return collected


def find_friend_of_friends(user: Dict) -> List[int]:
    """Return the unique ids of a user's friends-of-friends, in ascending order.

    The user's own id and the ids of direct friends are excluded. A user (or
    friend) without a "friends" entry is treated as having no friends, so
    calling this before friendships are attached yields an empty list instead
    of raising.
    """
    friends = user.get("friends") or []

    # Everyone reachable in exactly two hops, with duplicates collapsed.
    two_hop_ids = {
        foaf.get("id")
        for friend in friends
        for foaf in friend.get("friends") or []
    }

    # Direct friends and the user themself are not suggestions.
    excluded_ids = {friend.get("id") for friend in friends}
    excluded_ids.add(user.get("id"))

    # Sorted so the result does not depend on set iteration order.
    return sorted(two_hop_ids - excluded_ids)


def not_the_same(user: Dict, other_user: Dict) -> bool:
    """Return True when the two user dicts carry different "id" values.

    With classes instead of dicts, this comparison would live in
    __eq__/__ne__.
    """
    first_id = user.get("id")
    second_id = other_user.get("id")
    return first_id != second_id


def not_friends(user: Dict, other_user: Dict) -> bool:
    """Return True when ``other_user`` is not among ``user``'s friends.

    Friends are compared by id through ``not_the_same``; a user with an empty
    friends list is friends with nobody, so the result is True.
    """
    for friend in user.get("friends"):
        if not not_the_same(friend, other_user):
            # Found a friend with the same id: they are friends.
            return False
    return True


def friends_of_friends_ids(user: Dict[str, Any]) -> Counter:
    """Count mutual friends for each friend-of-friend of ``user``.

    Returns a Counter mapping a candidate's id to the number of ``user``'s
    friends who list that candidate as a friend. The user themself and the
    user's direct friends are skipped.
    """
    mutual_counts = Counter()
    for friend in user.get("friends"):
        for foaf in friend.get("friends"):
            if not_the_same(user, foaf) and not_friends(user, foaf):
                mutual_counts[foaf.get("id")] += 1
    return mutual_counts


def data_scientist_who_like(target_interst: str) -> List[int]:
    """Return the ids of users whose interests include ``target_interst``.

    The match is an exact, case-sensitive string comparison. Every call scans
    the whole module-level interest list, so this is inefficient for repeated
    lookups.
    """
    matching_ids = []
    for user_id, interest in INTERETS:
        if target_interst == interest:
            matching_ids.append(user_id)
    return matching_ids


def build_interest_to_user_index(
    interests: List[Tuple[int, "str"]]
) -> Dict[str, List[int]]:
    """Group user ids by interest.

    Takes ``(user_id, interest)`` pairs and returns a ``defaultdict(list)``
    mapping each interest to the ids of the users who listed it, in input
    order.
    """
    index = defaultdict(list)

    for pair in interests:
        who, topic = pair
        index[topic].append(who)

    return index


def build_user_to_interest_index(
    interests: List[Tuple[int, "str"]]
) -> Dict[int, List[str]]:
    """Group interests by user id.

    Takes ``(user_id, interest)`` pairs and returns a ``defaultdict(list)``
    mapping each user id to that user's interests, in input order.
    """
    index = defaultdict(list)

    for pair in interests:
        who, topic = pair
        index[who].append(topic)

    return index


def most_common_interests_with(user, user_index, interest_index):
    """Count how many interests each other user shares with ``user``.

    ``user_index`` maps a user id to that user's interests and
    ``interest_index`` maps an interest to the ids of users who have it.
    Returns a Counter of other-user id -> number of shared interests; the
    user's own id is left out.
    """
    overlap = Counter()
    for interest in user_index.get(user.get("id")):
        for interested_user_id in interest_index.get(interest):
            if user.get("id") != interested_user_id:
                overlap[interested_user_id] += 1
    return overlap


# Write a function that will produce a count of users expressing an interest


def build_interest_counter(interests: List[Tuple[int, str]]) -> Counter:
    """Count the words that appear across all interest strings.

    Each interest is lower-cased and split on whitespace; the user id in each
    pair is ignored. Returns a Counter of word -> occurrences.
    """
    word_counts = Counter()
    for _user_id, interest in interests:
        word_counts.update(interest.lower().split())
    return word_counts


def print_counter(counter: Counter):
    """Print one "key: count" line per entry, in the counter's own order."""
    for entry in counter.items():
        word, count = entry
        print(f"{word}: {count}")


if __name__ == "__main__":
    separator = "-" * 100

    # Build the friendship graph; this must happen before any other call.
    users = add_friendships(USERS, FRIENDSHIPS)
    hero = users[0]
    chi = users[3]

    # Connection statistics.
    num_users = len(users)
    total_connections = sum(map(number_of_friends, users))
    avg_connections = total_connections / num_users
    counts = friend_counts(users)

    # Friend suggestions: naive list, de-duplicated list, mutual-friend counts.
    heros_suggested_friends = friend_of_friend_ids_bad(hero)
    better_friend_suggestion = find_friend_of_friends(hero)
    chis_friend_suggestions_with_counts = friends_of_friends_ids(chi)

    print(f"Total user count: {num_users}")
    print(f"Total number of connections: {total_connections}")
    print(f"Average connections: {avg_connections}")
    print(counts)
    print(heros_suggested_friends)
    print(better_friend_suggestion)
    print(f"Chi's friend suggestions: {chis_friend_suggestions_with_counts}")

    # Users ranked by how many interests they share with Chi.
    print(separator)
    user_idx = build_user_to_interest_index(INTERETS)
    interest_idx = build_interest_to_user_index(INTERETS)
    chis_common_interests = most_common_interests_with(chi, user_idx, interest_idx)
    print("Chi's most users with greatest interest overlap:")
    print(chis_common_interests)

    # Word frequencies across all interests.
    print(separator)
    interest_counter = build_interest_counter(INTERETS)
    print_counter(interest_counter)
On to ch3 6 years ago			`from collections import Counter, defaultdict`
			`from typing import List, Dict, Tuple, Any`

			`USERS = [`
			`{"id": 0, "name": "Hero"},`
			`{"id": 1, "name": "Dunn"},`
			`{"id": 2, "name": "Sue"},`
			`{"id": 3, "name": "Chi"},`
			`{"id": 4, "name": "Thor"},`
			`{"id": 5, "name": "Clive"},`
			`{"id": 6, "name": "Hicks"},`
			`{"id": 7, "name": "Devin"},`
			`{"id": 8, "name": "Kate"},`
			`{"id": 9, "name": "Klein"},`
			`]`

			`FRIENDSHIPS = [`
			`(0, 1),`
			`(0, 2),`
			`(1, 2),`
			`(1, 3),`
			`(2, 3),`
			`(3, 4),`
			`(4, 5),`
			`(5, 6),`
			`(5, 7),`
			`(6, 8),`
			`(7, 8),`
			`(8, 9),`
			`]`

			`INTERETS = [`
			`(0, "Hadoop"),`
			`(0, "Big Data"),`
			`(0, "HBase"),`
			`(0, "Java"),`
			`(0, "Spark"),`
			`(0, "Storm"),`
			`(0, "Cassandra"),`
			`(1, "NoSQL"),`
			`(1, "MongoDB"),`
			`(1, "Cassandra"),`
			`(1, "HBase"),`
			`(1, "Postgres"),`
			`(2, "Python"),`
			`(2, "scikit-learn"),`
			`(2, "scipy"),`
			`(2, "numpy"),`
			`(2, "statsmodels"),`
			`(2, "pandas"),`
			`(3, "R"),`
			`(3, "Python"),`
			`(3, "statistics"),`
			`(3, "regression"),`
			`(3, "probability"),`
			`(4, "machine learning"),`
			`(4, "regression"),`
			`(4, "decision trees"),`
			`(4, "libsvm"),`
			`(5, "Python"),`
			`(5, "R"),`
			`(5, "Java"),`
			`(5, "C++"),`
			`(5, "Haskell"),`
			`(5, "programming languages"),`
			`(6, "statistics"),`
			`(6, "probability"),`
			`(6, "mathematics"),`
			`(6, "theory"),`
			`(7, "machine learning"),`
			`(7, "scikit-learn"),`
			`(7, "Mahout"),`
			`(7, "neural networks"),`
			`(8, "neural networks"),`
			`(8, "deep learning"),`
			`(8, "Big Data"),`
			`(8, "artificial intelligence"),`
			`(9, "Hadoop"),`
			`(9, "Java"),`
			`(9, "MapReduce"),`
			`(9, "Big Data"),`
			`]`


			`def add_friendships(`
			`users: List[Dict], friendships: List[Tuple[int, int]]`
			`) -> List[Dict]:`
			`for user in users:`
			`user["friends"] = []`

			`for i, j in friendships:`
			`users[i]["friends"].append(users[j]) # add j as friend of i`
			`users[j]["friends"].append(users[i]) # add j as friend of i`

			`return users`


			`def number_of_friends(user: Dict) -> int:`
			`return len(user["friends"])`


			`def friend_counts(users: List[Dict]) -> List[Tuple[int, int]]:`
			`"""Returns a sorted list by number of friends"""`
			`number_of_friends_by_id = [(user["id"], number_of_friends(user)) for user in users]`

			`_sorted_list = sorted(number_of_friends_by_id, key=lambda tup: tup[1], reverse=True)`
			`return _sorted_list`


			`def friend_of_friend_ids_bad(user: Dict[str, Any]) -> List[int]:`
			`return [foaf["id"] for friend in user["friends"] for foaf in friend["friends"]]`


			`def find_friend_of_friends(user: Dict) -> List[int]:`
			`"""Returns the unique list of ids for a user's friends of a friend."""`
			`foaf = {`
			`foaf.get("id")`
			`for friend in user.get("friends")`
			`for foaf in friend.get("friends")`
			`}`

			`difference_set = {friend.get("id") for friend in user.get("friends")}`
			`difference_set.add(user.get("id"))`

			`return list(foaf.difference(difference_set))`


			`def not_the_same(user: Dict, other_user: Dict) -> bool:`
			`"""tests for user equivalence.`

			`If this was OOP this would have used __eq__`
			`"""`
			`return user.get("id") != other_user.get("id")`


			`def not_friends(user: Dict, other_user: Dict) -> bool:`
			`"""Tests for active friendship."""`
			`return all(not_the_same(friend, other_user) for friend in user.get("friends"))`


			`def friends_of_friends_ids(user: Dict[str, Any]) -> Counter:`
			`return Counter(`
			`foaf.get("id")`
			`for friend in user.get("friends")`
			`for foaf in friend.get("friends")`
			`if not_the_same(user, foaf) and not_friends(user, foaf)`
			`)`


			`def data_scientist_who_like(target_interst: str) -> List[int]:`
			`"""returns a list of DS ids if they have a particular target_interest.`

			`This is not a very efficient function since it always has to search through.`
			`The entire list of interests.`
			`"""`
			`return [user_id for user_id, interest in INTERETS if target_interst == interest]`


			`def build_interest_to_user_index(`
			`interests: List[Tuple[int, "str"]]`
			`) -> Dict[str, List[int]]:`
			`user_ids_by_interest = defaultdict(list)`

			`for user_id, interest in interests:`
			`user_ids_by_interest[interest].append(user_id)`

			`return user_ids_by_interest`


			`def build_user_to_interest_index(`
			`interests: List[Tuple[int, "str"]]`
			`) -> Dict[int, List[str]]:`
			`interests_by_user_id = defaultdict(list)`

			`for user_id, interest in interests:`
			`interests_by_user_id[user_id].append(interest)`

			`return interests_by_user_id`


			`def most_common_interests_with(user, user_index, interest_index):`
			`return Counter(`
			`interested_user_id`
			`for interest in user_index.get(user.get("id"))`
			`for interested_user_id in interest_index.get(interest)`
			`if user.get("id") != interested_user_id`
			`)`


			`# Write a function that will produce a count of users expressing an interest`


			`def build_interest_counter(interests: List[Tuple[int, str]]) -> Counter:`
			`return Counter(`
			`word for user_id, interest in interests for word in interest.lower().split()`
			`)`


			`def print_counter(counter: Counter):`
			`for word, count in counter.items():`
			`print(f"{word}: {count}")`


			`if __name__ == "__main__":`
			`users = add_friendships(USERS, FRIENDSHIPS)`
			`total_connections = sum(number_of_friends(user) for user in users)`
			`num_users = len(users)`
			`avg_connections = total_connections / num_users`
			`counts = friend_counts(users)`
			`heros_suggested_friends = friend_of_friend_ids_bad(users[0])`
			`better_friend_suggestion = find_friend_of_friends(users[0])`

			`chis_friend_suggestions_with_counts = friends_of_friends_ids(users[3])`

			`print(f"Total user count: {num_users}")`
			`print(f"Total number of connections: {total_connections}")`
			`print(f"Average connections: {avg_connections}")`
			`print(counts)`
			`print(heros_suggested_friends)`
			`print(better_friend_suggestion)`
			`print(f"Chi's friend suggestions: {chis_friend_suggestions_with_counts}")`

			`print("-" * 100)`
			`user_idx = build_user_to_interest_index(INTERETS)`
			`interest_idx = build_interest_to_user_index(INTERETS)`

			`chis_common_interests = most_common_interests_with(users[3], user_idx, interest_idx)`
			`print("Chi's most users with greatest interest overlap:")`
			`print(chis_common_interests)`

			`print("-" * 100)`
			`interest_counter = build_interest_counter(INTERETS)`
			`print_counter(interest_counter)`