On to ch3

7 years ago · f8c316559e
parent 8af7b3915c
commit f8c316559e
9 changed files with 329 additions and 8 deletions
--- a/ch1/README.md
+++ b/ch1/README.md
@ -0,0 +1,19 @@
+# Chapter 1
+
+## Takeaways
+
+This chapter uses a fictitious first day as a Data Scientist to illustrate some social science research. In particular it illustrates the need to ask good questions of the data you have.
+
+For example, we had data on inter-office friendships and professional interests. From there we could determine:
+
+- Average number of friends
+- Total count of inter-office friendships
+- A suggested list of people a person could befriend based on mutual friendships (friends of a friend)
+- A suggested list of people a person could befriend based on mutual interests
+
+
+These examples continued with salary data, tenure, and paid accounts:
+
+- At first we could explore the relationship between salary and tenure with a visualization, which showed that individuals with longer tenures tend to earn more.
+- Since tenure was widely distributed, it was necessary to bucket individuals by tenure to get an average salary for each tenure range.
+  - This led to the insight that data scientists with more than 5 years of experience earn 65% more than data scientists with two years or less of experience.
--- a/ch1/init.py
+++ b/ch1/init.py
--- a/ch1/friends.py
+++ b/ch1/friends.py
@ -11,17 +11,29 @@ USERS = [
    {"id": 7, "name": "Devin"},
    {"id": 8, "name": "Kate"},
    {"id": 9, "name": "Klein"},
-
 ]

 FRIENDSHIPS = [
-    (0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (3, 4), (4, 5), (5, 6), (5, 7), (6, 8), (7, 8), (8, 9),
+    (0, 1),
+    (0, 2),
+    (1, 2),
+    (1, 3),
+    (2, 3),
+    (3, 4),
+    (4, 5),
+    (5, 6),
+    (5, 7),
+    (6, 8),
+    (7, 8),
+    (8, 9),
 ]


-def add_friendships(users: List[Dict], friendships: List[Tuple[int, int]]) -> List[Dict]:
+def add_friendships(
+    users: List[Dict], friendships: List[Tuple[int, int]]
+) -> List[Dict]:
    for user in users:
-        user['friends'] = []
+        user["friends"] = []

    for i, j in friendships:
        users[i]["friends"].append(users[j])  # add j as friend of i
@ -36,11 +48,9 @@ def number_of_friends(user: Dict) -> int:

 def friend_counts(users: List[Dict]) -> List[Tuple[int, int]]:
    """Returns a sorted list by number of friends"""
-    number_of_friends_by_id = [(user['id'], number_of_friends(user)) for user in users]
+    number_of_friends_by_id = [(user["id"], number_of_friends(user)) for user in users]

-    _sorted_list = sorted(number_of_friends_by_id,
-                          key=lambda tup: tup[1],
-                          reverse=True)
+    _sorted_list = sorted(number_of_friends_by_id, key=lambda tup: tup[1], reverse=True)
    return _sorted_list


--- a/ch1/friends2.py
+++ b/ch1/friends2.py
@ -0,0 +1,233 @@
+from collections import Counter, defaultdict
+from typing import List, Dict, Tuple, Any
+
+# Sample users for the chapter 1 examples; each record has an "id" and a "name".
+# In this data every id equals the record's position in the list, which matters
+# because add_friendships() below looks users up by list position.
+USERS = [
+    {"id": 0, "name": "Hero"},
+    {"id": 1, "name": "Dunn"},
+    {"id": 2, "name": "Sue"},
+    {"id": 3, "name": "Chi"},
+    {"id": 4, "name": "Thor"},
+    {"id": 5, "name": "Clive"},
+    {"id": 6, "name": "Hicks"},
+    {"id": 7, "name": "Devin"},
+    {"id": 8, "name": "Kate"},
+    {"id": 9, "name": "Klein"},
+]
+
+# One pair of integers per friendship; add_friendships() uses each value as an
+# index into the users list and links the two users to each other.
+FRIENDSHIPS = [
+    (0, 1),
+    (0, 2),
+    (1, 2),
+    (1, 3),
+    (2, 3),
+    (3, 4),
+    (4, 5),
+    (5, 6),
+    (5, 7),
+    (6, 8),
+    (7, 8),
+    (8, 9),
+]
+
+# (user id, interest) pairs; a user appears once per interest.
+# NOTE(review): the constant name is misspelled ("INTERETS", not "INTERESTS").
+# It is left unchanged because other code in this module refers to it by this name.
+INTERETS = [
+    (0, "Hadoop"),
+    (0, "Big Data"),
+    (0, "HBase"),
+    (0, "Java"),
+    (0, "Spark"),
+    (0, "Storm"),
+    (0, "Cassandra"),
+    (1, "NoSQL"),
+    (1, "MongoDB"),
+    (1, "Cassandra"),
+    (1, "HBase"),
+    (1, "Postgres"),
+    (2, "Python"),
+    (2, "scikit-learn"),
+    (2, "scipy"),
+    (2, "numpy"),
+    (2, "statsmodels"),
+    (2, "pandas"),
+    (3, "R"),
+    (3, "Python"),
+    (3, "statistics"),
+    (3, "regression"),
+    (3, "probability"),
+    (4, "machine learning"),
+    (4, "regression"),
+    (4, "decision trees"),
+    (4, "libsvm"),
+    (5, "Python"),
+    (5, "R"),
+    (5, "Java"),
+    (5, "C++"),
+    (5, "Haskell"),
+    (5, "programming languages"),
+    (6, "statistics"),
+    (6, "probability"),
+    (6, "mathematics"),
+    (6, "theory"),
+    (7, "machine learning"),
+    (7, "scikit-learn"),
+    (7, "Mahout"),
+    (7, "neural networks"),
+    (8, "neural networks"),
+    (8, "deep learning"),
+    (8, "Big Data"),
+    (8, "artificial intelligence"),
+    (9, "Hadoop"),
+    (9, "Java"),
+    (9, "MapReduce"),
+    (9, "Big Data"),
+]
+
+
+def add_friendships(
+    users: List[Dict], friendships: List[Tuple[int, int]]
+) -> List[Dict]:
+    """Attach a "friends" list to every user and fill it from ``friendships``.
+
+    Each pair in ``friendships`` is used as two positions in ``users``; both
+    users of a pair get the other one appended to their "friends" list.  The
+    user dicts are modified in place (any existing "friends" value is replaced)
+    and the same ``users`` list is returned.
+    """
+    for person in users:
+        person["friends"] = []
+
+    for left, right in friendships:
+        first, second = users[left], users[right]
+        first["friends"].append(second)  # second becomes a friend of first
+        second["friends"].append(first)  # first becomes a friend of second
+
+    return users
+
+
+def number_of_friends(user: Dict) -> int:
+    """Return how many entries are in the user's "friends" list."""
+    friends = user["friends"]
+    return len(friends)
+
+
+def friend_counts(users: List[Dict]) -> List[Tuple[int, int]]:
+    """Return (user id, friend count) pairs ordered from most to fewest friends.
+
+    Users with the same number of friends keep their relative input order.
+    """
+    ranking = [(user["id"], number_of_friends(user)) for user in users]
+    # In-place stable sort on the count (second item of each pair), descending.
+    ranking.sort(key=lambda pair: pair[1], reverse=True)
+    return ranking
+
+
+def friend_of_friend_ids_bad(user: Dict[str, Any]) -> List[int]:
+    """Collect the id of every friend of each of the user's friends.
+
+    Nothing is filtered or de-duplicated: an id appears once for every path
+    that reaches it, so the user's own id and the ids of direct friends may be
+    included.
+    """
+    collected = []
+    for friend in user["friends"]:
+        for foaf in friend["friends"]:
+            collected.append(foaf["id"])
+    return collected
+
+
+def find_friend_of_friends(user: Dict) -> List[int]:
+    """Return the unique ids of the user's friends of friends, in ascending order.
+
+    The user's own id and the ids of the user's direct friends are excluded,
+    so only people the user is not yet connected to are returned.
+    """
+    friends = user.get("friends")
+
+    # Ids that must never be suggested: direct friends and the user itself.
+    excluded = {friend.get("id") for friend in friends}
+    excluded.add(user.get("id"))
+
+    candidates = {
+        foaf.get("id") for friend in friends for foaf in friend.get("friends")
+    }
+
+    # Sort so the result is deterministic; the iteration order of a set is arbitrary.
+    return sorted(candidates - excluded)
+
+
+def not_the_same(user: Dict, other_user: Dict) -> bool:
+    """Return True when the two users have different "id" values.
+
+    This is an inequality check on ids; an OOP version would rely on
+    ``__eq__`` instead.
+    """
+    user_id = user.get("id")
+    other_id = other_user.get("id")
+    return user_id != other_id
+
+
+def not_friends(user: Dict, other_user: Dict) -> bool:
+    """Return True when no entry in ``user``'s friends has ``other_user``'s id."""
+    for friend in user.get("friends"):
+        # A friend with the same id as other_user means they are already friends.
+        if not not_the_same(friend, other_user):
+            return False
+    return True
+
+
+def friends_of_friends_ids(user: Dict[str, Any]) -> Counter:
+    """Count how often each friend-of-a-friend id is reached through the user's friends.
+
+    The user itself and anyone already among the user's friends are skipped,
+    so each count is the number of the user's friends that list that person.
+    """
+    tally = Counter()
+    for friend in user.get("friends"):
+        for foaf in friend.get("friends"):
+            if not_the_same(user, foaf) and not_friends(user, foaf):
+                tally[foaf.get("id")] += 1
+    return tally
+
+
+def data_scientist_who_like(target_interst: str) -> List[int]:
+    """Return the ids of users whose interests include ``target_interst``.
+
+    The match is an exact string comparison.  Every call scans the whole
+    module-level INTERETS list, so each lookup costs a full pass over it.
+    """
+    matching_ids = []
+    for user_id, interest in INTERETS:
+        if target_interst == interest:
+            matching_ids.append(user_id)
+    return matching_ids
+
+
+def build_interest_to_user_index(
+    interests: List[Tuple[int, "str"]]
+) -> Dict[str, List[int]]:
+    """Group user ids by interest.
+
+    Returns a ``defaultdict(list)`` mapping each interest to the user ids that
+    listed it, in the order the pairs were given.
+    """
+    index = defaultdict(list)
+
+    for user_id, interest in interests:
+        bucket = index[interest]  # created on first sight of this interest
+        bucket.append(user_id)
+
+    return index
+
+
+def build_user_to_interest_index(
+    interests: List[Tuple[int, "str"]]
+) -> Dict[int, List[str]]:
+    """Group interests by user id.
+
+    Returns a ``defaultdict(list)`` mapping each user id to the interests
+    listed for it, in the order the pairs were given.
+    """
+    index = defaultdict(list)
+
+    for user_id, interest in interests:
+        bucket = index[user_id]  # created on first sight of this user id
+        bucket.append(interest)
+
+    return index
+
+
+def most_common_interests_with(
+    user: Dict[str, Any],
+    user_index: Dict[int, List[str]],
+    interest_index: Dict[str, List[int]],
+) -> Counter:
+    """Count how many interests every other user shares with ``user``.
+
+    Args:
+        user: record whose "id" is looked up in ``user_index``.
+        user_index: mapping of user id to that user's interests.
+        interest_index: mapping of interest to the ids of users who listed it.
+
+    Returns:
+        A Counter keyed by other users' ids; the value is the number of shared
+        interests.  ``user`` itself is never counted.  The Counter is empty when
+        the user has no entry in ``user_index``.
+    """
+    user_id = user.get("id")
+
+    # .get(key, ()) makes a user or an interest that is missing from an index
+    # contribute nothing, instead of raising TypeError while iterating None.
+    return Counter(
+        other_id
+        for interest in user_index.get(user_id, ())
+        for other_id in interest_index.get(interest, ())
+        if other_id != user_id
+    )
+
+
+# Write a function that will produce a count of users expressing an interest
+
+
+def build_interest_counter(interests: List[Tuple[int, str]]) -> Counter:
+    """Count lower-cased, whitespace-separated words across all interest strings.
+
+    The user id in each pair is ignored; a multi-word interest contributes one
+    count per word.
+    """
+    word_counts = Counter()
+    for _, interest in interests:
+        word_counts.update(interest.lower().split())
+    return word_counts
+
+
+def print_counter(counter: Counter):
+    """Print one "key: count" line per entry, in the counter's iteration order."""
+    for word in counter:
+        count = counter[word]
+        print(f"{word}: {count}")
+
+
+def main() -> None:
+    """Run the chapter 1 demo: friendship stats, friend suggestions, interest counts."""
+    # --- friendships -------------------------------------------------------
+    users = add_friendships(USERS, FRIENDSHIPS)
+    total_connections = sum(number_of_friends(user) for user in users)
+    num_users = len(users)
+    avg_connections = total_connections / num_users
+    ranked_counts = friend_counts(users)
+    hero = users[0]
+    chi = users[3]
+    naive_suggestions = friend_of_friend_ids_bad(hero)
+    filtered_suggestions = find_friend_of_friends(hero)
+    chis_friend_suggestions_with_counts = friends_of_friends_ids(chi)
+
+    print(f"Total user count: {num_users}")
+    print(f"Total number of connections: {total_connections}")
+    print(f"Average connections: {avg_connections}")
+    print(ranked_counts)
+    print(naive_suggestions)
+    print(filtered_suggestions)
+    print(f"Chi's friend suggestions: {chis_friend_suggestions_with_counts}")
+
+    # --- shared interests --------------------------------------------------
+    print("-" * 100)
+    user_idx = build_user_to_interest_index(INTERETS)
+    interest_idx = build_interest_to_user_index(INTERETS)
+    overlap = most_common_interests_with(chi, user_idx, interest_idx)
+    print("Chi's most users with greatest interest overlap:")
+    print(overlap)
+
+    # --- word counts over all interests ------------------------------------
+    print("-" * 100)
+    print_counter(build_interest_counter(INTERETS))
+
+
+if __name__ == "__main__":
+    main()
--- a/ch1/oop_friends.py
+++ b/ch1/oop_friends.py
@ -0,0 +1,28 @@
+"""
+Created just to test the idea of OOP for User representation. Kind of heavy on the memory usage side though.
+"""
+
+
+from typing import List, Optional
+
+
+class User:
+    """A user record; two users compare equal when their ``id`` values match.
+
+    Attributes:
+        id: identifier used for equality and hashing.
+        name: display name, shown in ``repr``.
+        friends: the list passed as ``freinds``, or a fresh empty list.
+    """
+
+    def __init__(
+        self, id: int, name: str, freinds: Optional[List[int]] = None
+    ) -> None:
+        # NOTE: the parameter is spelled "freinds" in the existing interface; the
+        # spelling is kept so keyword callers keep working.
+        self.id = id
+        self.name = name
+        # A ``[]`` default is evaluated once and would be shared by every
+        # instance built without an explicit list, so create one per instance.
+        self.friends = [] if freinds is None else freinds
+
+    def __eq__(self, other):
+        # Anything that is not a User is never equal to one.
+        return isinstance(other, User) and self.id == other.id
+
+    def __hash__(self):
+        # Defining __eq__ alone makes instances unhashable; hash on the same
+        # key that __eq__ compares so equal users hash equally.
+        return hash(self.id)
+
+    def __repr__(self):
+        return f"User({self.id}, '{self.name}')"
+
+
+class SuperUser(User):
+    """A ``User`` whose ``admin`` attribute is set to ``True``."""
+
+    def __init__(self, id: int, name: str, friends: Optional[List[int]] = None):
+        # Build the list here rather than in the signature: a ``[]`` default is
+        # evaluated once and would be shared by every SuperUser created without
+        # an explicit friends list.
+        super().__init__(id, name, [] if friends is None else friends)
+        self.admin = True
--- a/ch3/init.py
+++ b/ch3/init.py
--- a/ch3/bar_chart.py
+++ b/ch3/bar_chart.py
@ -0,0 +1,18 @@
+from matplotlib import pyplot as plt
+
+movies = ["Annie Hall", "Ben-hur", "Casablanca", "Gandhi", "West Side Story"]
+
+num_oscars = [5, 11, 3, 8, 10]
+
+# One x position per movie.  plt.bar() centers each bar on its x coordinate by
+# default (align="center"), so no manual offset is needed to line the bars up
+# with their tick labels.
+xs = list(range(len(movies)))
+
+# Plot one bar per movie with height num_oscars
+plt.bar(xs, num_oscars)
+plt.ylabel("# of Oscars")
+plt.title("Movies")
+
+# Label the x-axis with the movie names at the bar centers
+plt.xticks(xs, movies)
+plt.show()
--- a/ch3/line_plot.py
+++ b/ch3/line_plot.py
@ -0,0 +1,11 @@
+from matplotlib import pyplot as plt
+
+# One GDP value per decade: range(1950, 2020, 10) yields 1950 through 2010.
+years = range(1950, 2020, 10)
+gdp = [300.2, 543.3, 1075.9, 2862.5, 5979.6, 10289.7, 14958.3]
+
+# Solid green line with a circular marker at every data point
+plt.plot(years, gdp, color="green", linestyle="solid", marker="o")
+
+plt.ylabel("Billions of $")
+plt.title("Nominal GDP")
+
+plt.show()
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,2 @@
+matplotlib==3.0.2
+black