On to ch3

master
androiddrew 6 years ago
parent 8af7b3915c
commit f8c316559e

@ -0,0 +1,19 @@
# Chapter 1
## Take aways
This chapter uses a fictitious first day as a Data Scientist to illustrate some social science research. In particular it illustrates the need to ask good questions of the data you have.
For example, we had data on inter-office friendships and professional interests; from there we could determine:
- Average number of friends
- Total count of inter office friendships
- A suggested list of people a person could befriend based on mutual friendships (friends of a friend)
- A suggested list of people a person could befriend based on mutual interest.
These examples continued with salary data, tenure, and paid accounts:
- At first we explored the relationship between salary and tenure with a visualization, which showed that individuals with longer tenures tend to earn more.
- Since tenure was widely distributed, it was necessary to bucket individuals by tenure to get an average salary for each tenure range.
- This led to the insight that data scientists with more than 5 years of experience earn 65% more than data scientists with two years or less of experience.

@ -11,17 +11,29 @@ USERS = [
{"id": 7, "name": "Devin"}, {"id": 7, "name": "Devin"},
{"id": 8, "name": "Kate"}, {"id": 8, "name": "Kate"},
{"id": 9, "name": "Klein"}, {"id": 9, "name": "Klein"},
] ]
FRIENDSHIPS = [ FRIENDSHIPS = [
(0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (3, 4), (4, 5), (5, 6), (5, 7), (6, 8), (7, 8), (8, 9), (0, 1),
(0, 2),
(1, 2),
(1, 3),
(2, 3),
(3, 4),
(4, 5),
(5, 6),
(5, 7),
(6, 8),
(7, 8),
(8, 9),
] ]
def add_friendships(users: List[Dict], friendships: List[Tuple[int, int]]) -> List[Dict]: def add_friendships(
users: List[Dict], friendships: List[Tuple[int, int]]
) -> List[Dict]:
for user in users: for user in users:
user['friends'] = [] user["friends"] = []
for i, j in friendships: for i, j in friendships:
users[i]["friends"].append(users[j]) # add j as friend of i users[i]["friends"].append(users[j]) # add j as friend of i
@ -36,11 +48,9 @@ def number_of_friends(user: Dict) -> int:
def friend_counts(users: List[Dict]) -> List[Tuple[int, int]]: def friend_counts(users: List[Dict]) -> List[Tuple[int, int]]:
"""Returns a sorted list by number of friends""" """Returns a sorted list by number of friends"""
number_of_friends_by_id = [(user['id'], number_of_friends(user)) for user in users] number_of_friends_by_id = [(user["id"], number_of_friends(user)) for user in users]
_sorted_list = sorted(number_of_friends_by_id, _sorted_list = sorted(number_of_friends_by_id, key=lambda tup: tup[1], reverse=True)
key=lambda tup: tup[1],
reverse=True)
return _sorted_list return _sorted_list

@ -0,0 +1,233 @@
from collections import Counter, defaultdict
from typing import List, Dict, Tuple, Any
# User records for the example network; each record's "id" equals its
# position in the list.
USERS = [
    {"id": user_id, "name": name}
    for user_id, name in enumerate(
        [
            "Hero",
            "Dunn",
            "Sue",
            "Chi",
            "Thor",
            "Clive",
            "Hicks",
            "Devin",
            "Kate",
            "Klein",
        ]
    )
]
# Friendship pairs (i, j). add_friendships uses each number as a position in
# the users list and records the link on both users.
FRIENDSHIPS = [
(0, 1),
(0, 2),
(1, 2),
(1, 3),
(2, 3),
(3, 4),
(4, 5),
(5, 6),
(5, 7),
(6, 8),
(7, 8),
(8, 9),
]
# (user_id, interest) pairs, listed user by user in ascending id order.
# NOTE(review): the name looks like a typo of INTERESTS; it is kept because
# other code in this module refers to INTERETS.
INTERETS = [
    (user_id, interest)
    for user_id, user_interests in enumerate(
        [
            ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
            ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
            ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
            ["R", "Python", "statistics", "regression", "probability"],
            ["machine learning", "regression", "decision trees", "libsvm"],
            ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
            ["statistics", "probability", "mathematics", "theory"],
            ["machine learning", "scikit-learn", "Mahout", "neural networks"],
            ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
            ["Hadoop", "Java", "MapReduce", "Big Data"],
        ]
    )
    for interest in user_interests
]
def add_friendships(
    users: List[Dict], friendships: List[Tuple[int, int]]
) -> List[Dict]:
    """Attach a ``"friends"`` list to every user and fill it from ``friendships``.

    The user dicts are modified in place: any existing ``"friends"`` value is
    replaced by a fresh list, then every ``(i, j)`` pair links ``users[i]`` and
    ``users[j]`` to each other. The numbers in a pair are positions in
    ``users``.

    Returns the same ``users`` list that was passed in.
    """
    for record in users:
        record["friends"] = []

    for first, second in friendships:
        first_user, second_user = users[first], users[second]
        first_user["friends"].append(second_user)  # second is a friend of first
        second_user["friends"].append(first_user)  # first is a friend of second

    return users
def number_of_friends(user: Dict) -> int:
    """Return how many entries are in ``user["friends"]``."""
    friends = user["friends"]
    return len(friends)
def friend_counts(users: List[Dict]) -> List[Tuple[int, int]]:
    """Return ``(user id, friend count)`` pairs, most-connected users first.

    Users with the same friend count keep the relative order they have in
    ``users``, because the sort is stable.
    """
    pairs = []
    for user in users:
        pairs.append((user["id"], number_of_friends(user)))
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs
def friend_of_friend_ids_bad(user: Dict[str, Any]) -> List[int]:
    """Collect the ids of every friend of each of ``user``'s friends.

    Deliberately naive: nothing is filtered, so the result can contain
    ``user``'s own id, ids of people who are already friends, and duplicates.
    """
    ids = []
    for friend in user["friends"]:
        ids.extend(foaf["id"] for foaf in friend["friends"])
    return ids
def find_friend_of_friends(user: Dict) -> List[int]:
    """Return the distinct ids of friends-of-friends of ``user``.

    The user's own id and the ids of direct friends are excluded. The result
    comes from a set, so the order of the returned list is not defined.
    """
    direct_friends = user.get("friends")

    candidates = set()
    for friend in direct_friends:
        for foaf in friend.get("friends"):
            candidates.add(foaf.get("id"))

    # Everyone who must not be suggested: direct friends and the user itself
    excluded = {friend.get("id") for friend in direct_friends}
    excluded.add(user.get("id"))

    return list(candidates - excluded)
def not_the_same(user: Dict, other_user: Dict) -> bool:
    """Report whether two user records carry different ``"id"`` values.

    Dict-based stand-in for what an ``__eq__`` method would provide in an
    object-oriented design.
    """
    user_id = user.get("id")
    other_id = other_user.get("id")
    return user_id != other_id
def not_friends(user: Dict, other_user: Dict) -> bool:
    """Return True when no friend of ``user`` has the same id as ``other_user``."""
    for friend in user.get("friends"):
        if not not_the_same(friend, other_user):
            # Found a friend with other_user's id: they are already friends
            return False
    return True
def friends_of_friends_ids(user: Dict[str, Any]) -> Counter:
    """Count, per candidate id, how many of ``user``'s friends know that person.

    A candidate is a friend of a friend who is neither ``user`` itself nor
    already one of ``user``'s friends. Each occurrence through a different
    friend adds one to that candidate's count.
    """
    tally = Counter()
    for friend in user.get("friends"):
        for foaf in friend.get("friends"):
            if not_the_same(user, foaf) and not_friends(user, foaf):
                tally[foaf.get("id")] += 1
    return tally
def data_scientist_who_like(target_interst: str) -> List[int]:
    """Return the ids of users who listed exactly ``target_interst``.

    Not efficient: every call scans the whole module-level ``INTERETS`` list.
    The comparison is an exact, case-sensitive string match.
    """
    matches = []
    for user_id, interest in INTERETS:
        if target_interst == interest:
            matches.append(user_id)
    return matches
def build_interest_to_user_index(
    interests: List[Tuple[int, "str"]]
) -> Dict[str, List[int]]:
    """Group user ids by interest.

    Takes ``(user_id, interest)`` pairs and returns a ``defaultdict(list)``
    mapping each interest to the ids that listed it, in input order.
    """
    index = defaultdict(list)
    for user_id, interest in interests:
        members = index[interest]
        members.append(user_id)
    return index
def build_user_to_interest_index(
    interests: List[Tuple[int, "str"]]
) -> Dict[int, List[str]]:
    """Group interests by user id.

    Takes ``(user_id, interest)`` pairs and returns a ``defaultdict(list)``
    mapping each user id to the interests listed for it, in input order.
    """
    index = defaultdict(list)
    for user_id, interest in interests:
        listed = index[user_id]
        listed.append(interest)
    return index
def most_common_interests_with(user, user_index, interest_index):
    """Count how many interests each other user shares with ``user``.

    Args:
        user: User record; only its ``"id"`` is read.
        user_index: Mapping of user id to the list of that user's interests.
        interest_index: Mapping of interest to the list of user ids that
            listed it.

    Returns:
        A ``Counter`` mapping every other user's id to the number of interests
        shared with ``user``. It is empty when ``user`` has no entry in
        ``user_index``.
    """
    user_id = user.get("id")
    shared = Counter()
    # ``.get`` returns None for a missing key (a defaultdict's factory is not
    # used by ``.get``), so treat a missing entry as "no items" rather than
    # failing while iterating None.
    for interest in user_index.get(user_id) or ():
        for interested_user_id in interest_index.get(interest) or ():
            if interested_user_id != user_id:
                shared[interested_user_id] += 1
    return shared
# Counts how often each lowercase word appears across all interest strings
def build_interest_counter(interests: List[Tuple[int, str]]) -> Counter:
    """Return a ``Counter`` of the words used in the interest strings.

    Each interest is lowercased and split on whitespace; the user id of each
    pair is ignored.
    """
    word_counts = Counter()
    for _user_id, interest in interests:
        word_counts.update(interest.lower().split())
    return word_counts
def print_counter(counter: Counter):
    """Print one ``word: count`` line per entry, in the counter's own order."""
    for word in counter:
        print(f"{word}: {counter[word]}")
if __name__ == "__main__":
    # --- Friendship statistics -------------------------------------------
    users = add_friendships(USERS, FRIENDSHIPS)
    hero, chi = users[0], users[3]
    num_users = len(users)
    total_connections = sum(map(number_of_friends, users))

    # Everything is computed first, then printed in order.
    friendship_report = [
        f"Total user count: {num_users}",
        f"Total number of connections: {total_connections}",
        f"Average connections: {total_connections / num_users}",
        friend_counts(users),
        friend_of_friend_ids_bad(hero),
        find_friend_of_friends(hero),
        f"Chi's friend suggestions: {friends_of_friends_ids(chi)}",
        "-" * 100,
    ]
    for entry in friendship_report:
        print(entry)

    # --- Shared interests --------------------------------------------------
    user_idx = build_user_to_interest_index(INTERETS)
    interest_idx = build_interest_to_user_index(INTERETS)
    chis_common_interests = most_common_interests_with(chi, user_idx, interest_idx)
    print(
        "Chi's most users with greatest interest overlap:",
        chis_common_interests,
        "-" * 100,
        sep="\n",
    )

    # --- Word frequencies across all interests ----------------------------
    print_counter(build_interest_counter(INTERETS))

@ -0,0 +1,28 @@
"""
Created just to test the idea of an OOP User representation. Kind of heavy on the memory usage side though.
"""
from typing import List, Optional
class User:
    """A user identified by ``id``, with a ``name`` and a ``friends`` list.

    Two users compare equal when their ids match, and they hash by id, so
    equal users collapse in sets and work as dict keys.
    """

    def __init__(self, id: int, name: str, freinds: Optional[List[int]] = None) -> None:
        """Store the id, name and friends list.

        ``freinds`` (spelling kept so existing keyword callers still work) is
        stored as given; when omitted, each instance gets its own new list.
        """
        self.id = id
        self.name = name
        # A literal ``[]`` default would be one list shared by every User
        # created without an explicit list.
        self.friends = [] if freinds is None else freinds

    def __eq__(self, other):
        if not isinstance(other, User):
            # Let Python try the other operand; plain comparisons with
            # unrelated objects still evaluate to False.
            return NotImplemented
        return self.id == other.id

    def __hash__(self):
        # Defining __eq__ alone would make instances unhashable; hash on the
        # same field equality uses.
        return hash(self.id)

    def __repr__(self):
        return f"User({self.id}, '{self.name}')"
class SuperUser(User):
    """A ``User`` whose ``admin`` attribute is always set to True."""

    def __init__(self, id: int, name: str, friends: Optional[List[int]] = None) -> None:
        """Initialise the user part, then set ``admin`` to True.

        When ``friends`` is omitted a new list is created for this instance,
        so instances never share one default list.
        """
        super().__init__(id, name, [] if friends is None else friends)
        self.admin = True

@ -0,0 +1,18 @@
from matplotlib import pyplot as plt
movies = ["Annie Hall", "Ben-hur", "Casablanca", "Gandhi", "West Side Story"]
num_oscars = [5, 11, 3, 8, 10]

# One x position per movie. plt.bar centers each bar on its x coordinate by
# default, so no manual offset is needed to line bars up with their labels.
xs = list(range(len(movies)))

# Plot the bars with heights taken from num_oscars
plt.bar(xs, num_oscars)
plt.ylabel("# of Oscars")
plt.title("Movies")

# Label the x-axis with the movie names at the bar centers
plt.xticks(xs, movies)

plt.show()

@ -0,0 +1,11 @@
from matplotlib import pyplot as plt
# GDP figures (the y-axis label gives the unit: billions of dollars), one per
# decade starting at 1950
gdp = [300.2, 543.3, 1075.9, 2862.5, 5979.6, 10289.7, 14958.3]
years = range(1950, 2020, 10)

# Solid green line with a round marker at every data point
plt.plot(years, gdp, color="green", marker="o", linestyle="solid")

plt.title("Nominal GDP")
plt.ylabel("Billions of $")

plt.show()

@ -0,0 +1,2 @@
matplotlib==3.0.2
black
Loading…
Cancel
Save