diff --git a/ch1/README.md b/ch1/README.md new file mode 100644 index 0000000..79efb4b --- /dev/null +++ b/ch1/README.md @@ -0,0 +1,19 @@ +# Chapter 1 + +## Takeaways + +This chapter uses a fictitious first day as a Data Scientist to illustrate some social science research. In particular, it illustrates the need to ask good questions of the data you have. + +For example, we had data on inter-office friendships and professional interests; from there we could determine: + +- Average number of friends +- Total count of inter-office friendships +- A suggested list of people a person could befriend based on mutual friendships (friends of a friend) +- A suggested list of people a person could befriend based on mutual interests + + +These examples continued with salary data, tenure, and paid accounts: + +- At first we explored the relationship between salary and tenure with a visualization, which showed that individuals with longer tenures tend to earn more. +- Since tenure was widely distributed, it was necessary to bucket individuals by tenure to get an average salary for each tenure range. + - This led to the insight that data scientists with more than 5 years of experience earn 65% more than data scientists with two years or less of experience. 
diff --git a/ch1/__init__.py b/ch1/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ch1/friends.py b/ch1/friends.py index 1b15484..7db3d4e 100644 --- a/ch1/friends.py +++ b/ch1/friends.py @@ -11,17 +11,29 @@ USERS = [ {"id": 7, "name": "Devin"}, {"id": 8, "name": "Kate"}, {"id": 9, "name": "Klein"}, - ] FRIENDSHIPS = [ - (0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (3, 4), (4, 5), (5, 6), (5, 7), (6, 8), (7, 8), (8, 9), + (0, 1), + (0, 2), + (1, 2), + (1, 3), + (2, 3), + (3, 4), + (4, 5), + (5, 6), + (5, 7), + (6, 8), + (7, 8), + (8, 9), ] -def add_friendships(users: List[Dict], friendships: List[Tuple[int, int]]) -> List[Dict]: +def add_friendships( + users: List[Dict], friendships: List[Tuple[int, int]] +) -> List[Dict]: for user in users: - user['friends'] = [] + user["friends"] = [] for i, j in friendships: users[i]["friends"].append(users[j]) # add j as friend of i @@ -36,11 +48,9 @@ def number_of_friends(user: Dict) -> int: def friend_counts(users: List[Dict]) -> List[Tuple[int, int]]: """Returns a sorted list by number of friends""" - number_of_friends_by_id = [(user['id'], number_of_friends(user)) for user in users] + number_of_friends_by_id = [(user["id"], number_of_friends(user)) for user in users] - _sorted_list = sorted(number_of_friends_by_id, - key=lambda tup: tup[1], - reverse=True) + _sorted_list = sorted(number_of_friends_by_id, key=lambda tup: tup[1], reverse=True) return _sorted_list diff --git a/ch1/friends2.py b/ch1/friends2.py new file mode 100644 index 0000000..418dfe3 --- /dev/null +++ b/ch1/friends2.py @@ -0,0 +1,233 @@ +from collections import Counter, defaultdict +from typing import List, Dict, Tuple, Any + +USERS = [ # each user is a dict with an "id" and a "name" + {"id": 0, "name": "Hero"}, + {"id": 1, "name": "Dunn"}, + {"id": 2, "name": "Sue"}, + {"id": 3, "name": "Chi"}, + {"id": 4, "name": "Thor"}, + {"id": 5, "name": "Clive"}, + {"id": 6, "name": "Hicks"}, + {"id": 7, "name": "Devin"}, + {"id": 8, "name": "Kate"}, + {"id": 9, "name": "Klein"}, +] + +FRIENDSHIPS = 
[ # (i, j) pairs; add_friendships uses i and j as positions in the users list + (0, 1), + (0, 2), + (1, 2), + (1, 3), + (2, 3), + (3, 4), + (4, 5), + (5, 6), + (5, 7), + (6, 8), + (7, 8), + (8, 9), +] + +INTERETS = [ # (user id, interest) pairs. NOTE(review): name looks like a misspelling of INTERESTS; kept because the rest of this file references it + (0, "Hadoop"), + (0, "Big Data"), + (0, "HBase"), + (0, "Java"), + (0, "Spark"), + (0, "Storm"), + (0, "Cassandra"), + (1, "NoSQL"), + (1, "MongoDB"), + (1, "Cassandra"), + (1, "HBase"), + (1, "Postgres"), + (2, "Python"), + (2, "scikit-learn"), + (2, "scipy"), + (2, "numpy"), + (2, "statsmodels"), + (2, "pandas"), + (3, "R"), + (3, "Python"), + (3, "statistics"), + (3, "regression"), + (3, "probability"), + (4, "machine learning"), + (4, "regression"), + (4, "decision trees"), + (4, "libsvm"), + (5, "Python"), + (5, "R"), + (5, "Java"), + (5, "C++"), + (5, "Haskell"), + (5, "programming languages"), + (6, "statistics"), + (6, "probability"), + (6, "mathematics"), + (6, "theory"), + (7, "machine learning"), + (7, "scikit-learn"), + (7, "Mahout"), + (7, "neural networks"), + (8, "neural networks"), + (8, "deep learning"), + (8, "Big Data"), + (8, "artificial intelligence"), + (9, "Hadoop"), + (9, "Java"), + (9, "MapReduce"), + (9, "Big Data"), +] + + +def add_friendships( + users: List[Dict], friendships: List[Tuple[int, int]] +) -> List[Dict]: + for user in users: # reset every user's friend list; the input dicts are mutated in place + user["friends"] = [] + + for i, j in friendships: + users[i]["friends"].append(users[j]) # add j as friend of i + users[j]["friends"].append(users[i]) # add i as friend of j + + return users + + +def number_of_friends(user: Dict) -> int: + return len(user["friends"]) + + +def friend_counts(users: List[Dict]) -> List[Tuple[int, int]]: + """Returns (user id, number of friends) pairs sorted by number of friends, highest first""" + number_of_friends_by_id = [(user["id"], number_of_friends(user)) for user in users] + + _sorted_list = sorted(number_of_friends_by_id, key=lambda tup: tup[1], reverse=True) + return _sorted_list + + +def friend_of_friend_ids_bad(user: Dict[str, Any]) -> List[int]: # unfiltered: every id found in the friends' friend lists is returned, repeats included + return [foaf["id"] for friend in user["friends"] for foaf in friend["friends"]] + + +def find_friend_of_friends(user: 
Dict) -> List[int]: + """Returns the unique list of ids for a user's friends of a friend.""" + foaf = { + foaf.get("id") + for friend in user.get("friends") + for foaf in friend.get("friends") + } + + difference_set = {friend.get("id") for friend in user.get("friends")} + difference_set.add(user.get("id")) # the user's own id is excluded along with the current friends + + return list(foaf.difference(difference_set)) + + +def not_the_same(user: Dict, other_user: Dict) -> bool: + """Returns True when the two users have different ids. + + If this was OOP this would have used __ne__ + """ + return user.get("id") != other_user.get("id") + + +def not_friends(user: Dict, other_user: Dict) -> bool: + """Returns True when other_user is not among user's friends.""" + return all(not_the_same(friend, other_user) for friend in user.get("friends")) + + +def friends_of_friends_ids(user: Dict[str, Any]) -> Counter: # maps each friend-of-a-friend id to the number of friends it is reached through + return Counter( + foaf.get("id") + for friend in user.get("friends") + for foaf in friend.get("friends") + if not_the_same(user, foaf) and not_friends(user, foaf) + ) + + +def data_scientist_who_like(target_interst: str) -> List[int]: + """Returns the ids of the data scientists who list target_interst as an interest. + + This is not a very efficient function since it always has to search through + the entire list of interests. 
+ """ + return [user_id for user_id, interest in INTERETS if target_interst == interest] + + +def build_interest_to_user_index( + interests: List[Tuple[int, str]] +) -> Dict[str, List[int]]: # interest -> ids of the users who listed it + user_ids_by_interest = defaultdict(list) + + for user_id, interest in interests: + user_ids_by_interest[interest].append(user_id) + + return user_ids_by_interest + + +def build_user_to_interest_index( + interests: List[Tuple[int, str]] +) -> Dict[int, List[str]]: # user id -> interests listed by that user + interests_by_user_id = defaultdict(list) + + for user_id, interest in interests: + interests_by_user_id[user_id].append(interest) + + return interests_by_user_id + + +def most_common_interests_with(user, user_index, interest_index): # counts, per other user id, the interests shared with user + return Counter( + interested_user_id + for interest in user_index.get(user.get("id"), []) # a missing key gives an empty Counter instead of a TypeError on None + for interested_user_id in interest_index.get(interest, []) + if user.get("id") != interested_user_id + ) + + +# Count how often each word appears across all interests (interests are lower-cased and split on whitespace) + + +def build_interest_counter(interests: List[Tuple[int, str]]) -> Counter: + return Counter( + word for user_id, interest in interests for word in interest.lower().split() + ) + + +def print_counter(counter: Counter): + for word, count in counter.items(): + print(f"{word}: {count}") + + +if __name__ == "__main__": + users = add_friendships(USERS, FRIENDSHIPS) + total_connections = sum(number_of_friends(user) for user in users) + num_users = len(users) + avg_connections = total_connections / num_users + counts = friend_counts(users) + heros_suggested_friends = friend_of_friend_ids_bad(users[0]) + better_friend_suggestion = find_friend_of_friends(users[0]) + + chis_friend_suggestions_with_counts = friends_of_friends_ids(users[3]) + + print(f"Total user count: {num_users}") + print(f"Total number of connections: {total_connections}") + print(f"Average connections: {avg_connections}") + print(counts) + print(heros_suggested_friends) + print(better_friend_suggestion) + print(f"Chi's friend suggestions: 
{chis_friend_suggestions_with_counts}") + + print("-" * 100) + user_idx = build_user_to_interest_index(INTERETS) + interest_idx = build_interest_to_user_index(INTERETS) + + chis_common_interests = most_common_interests_with(users[3], user_idx, interest_idx) + print("Chi's most users with greatest interest overlap:") + print(chis_common_interests) + + print("-" * 100) + interest_counter = build_interest_counter(INTERETS) + print_counter(interest_counter) diff --git a/ch1/oop_friends.py b/ch1/oop_friends.py new file mode 100644 index 0000000..25f82b8 --- /dev/null +++ b/ch1/oop_friends.py @@ -0,0 +1,28 @@ +""" +Created just to test the idea of OOP for User representation. Kind of heavy on the memory usage side though. +""" + + +from typing import List, Optional + + +class User: + def __init__(self, id: int, name: str, freinds: Optional[List[int]] = None) -> None: + self.id = id + self.name = name + self.friends = [] if freinds is None else freinds # fresh list per instance; a [] default would be shared by every User created without friends + + def __eq__(self, other): # users are equal when their ids match; NOTE: defining __eq__ without __hash__ makes instances unhashable + if isinstance(other, User): + return self.id == other.id + else: + return False + + def __repr__(self): + return f"User({self.id}, '{self.name}')" + + +class SuperUser(User): + def __init__(self, id: int, name: str, friends: Optional[List[int]] = None): + super().__init__(id, name, friends) + self.admin = True diff --git a/ch3/__init__.py b/ch3/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ch3/bar_chart.py b/ch3/bar_chart.py new file mode 100644 index 0000000..3f1c24a --- /dev/null +++ b/ch3/bar_chart.py @@ -0,0 +1,18 @@ +from matplotlib import pyplot as plt + +movies = ["Annie Hall", "Ben-hur", "Casablanca", "Gandhi", "West Side Story"] + +num_oscars = [5, 11, 3, 8, 10] + +# Bars are by default 0.8 wide and centered on their x coordinate. 
The movie indices are therefore used directly as bar positions, with no manual offset +xs = list(range(len(movies))) + +# Plot the bars with center-x coordinates and heights (num_oscars) + +plt.bar(xs, num_oscars) +plt.ylabel("# of Oscars") +plt.title("Movies") + +# label x-axis with movie names at bar centers +plt.xticks(xs, movies) +plt.show() diff --git a/ch3/line_plot.py b/ch3/line_plot.py new file mode 100644 index 0000000..3b6806c --- /dev/null +++ b/ch3/line_plot.py @@ -0,0 +1,11 @@ +from matplotlib import pyplot as plt + +years = range(1950, 2020, 10) +gdp = [300.2, 543.3, 1075.9, 2862.5, 5979.6, 10289.7, 14958.3] + +plt.plot(years, gdp, marker="o", color="green", linestyle="solid") + +plt.title("Nominal GDP") +plt.ylabel("Billions of $") + +plt.show() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..4bdfd76 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +matplotlib==3.0.2 +black \ No newline at end of file