import numpy as np
import pandas


def normalize_features(df):
    """
    Normalize the features in the data set.

    Every column is shifted by its mean and divided by its standard
    deviation, both computed with the pandas defaults (``DataFrame.mean()``
    and ``DataFrame.std()``).

    Args:
        df: pandas DataFrame holding one numeric column per feature.

    Returns:
        A tuple ``(df_normalized, mu, sigma)`` where ``df_normalized`` is the
        rescaled frame and ``mu`` / ``sigma`` are the per-column mean and
        standard deviation that were used for the rescaling.

    Raises:
        ValueError: if any column has a standard deviation of exactly zero,
            because dividing by it would not produce usable values.
    """
    mu = df.mean()
    sigma = df.std()

    if (sigma == 0).any():
        raise ValueError(
            "One or more features had the same value for all samples, and thus could "
            "not be normalized. Please do not include features with only a single value "
            "in your model."
        )

    # Reuse the statistics computed above instead of scanning the frame again.
    df_normalized = (df - mu) / sigma
    return df_normalized, mu, sigma


def compute_cost(features, values, theta):
    """
    Compute the cost function given a set of features / values,
    and the values for our thetas.

    The cost is the sum of squared differences between ``features . theta``
    and ``values``, divided by ``2 * len(values)``.

    Args:
        features: 2-D array-like with one row per sample.
        values: 1-D array-like of observed targets, one per sample.
        theta: 1-D array-like of coefficients, one per feature column.

    Returns:
        The scalar cost for the given ``theta``.
    """
    m = len(values)
    residuals = np.dot(features, theta) - values
    sse = np.square(residuals).sum()
    return sse / (2 * m)


def gradient_descent(features, values, theta, alpha, num_iterations):
    """
    Perform gradient descent given a data set with an arbitrary number of
    features.

    Args:
        features: 2-D array with one row per sample.
        values: 1-D array of observed targets, one per sample.
        theta: 1-D array with the starting coefficients. It is not modified
            in place; a new array is produced on every step.
        alpha: step size applied to every update.
        num_iterations: number of update steps to run.

    Returns:
        A tuple ``(theta, cost_history)`` where ``theta`` holds the
        coefficients after the last step and ``cost_history`` is a
        ``pandas.Series`` with the cost measured after each step.
    """
    m = len(values)
    cost_history = []

    for _ in range(num_iterations):
        predicted_values = np.dot(features, theta)
        # Step along the negative gradient of the squared-error cost.
        theta = theta + alpha / m * np.dot(values - predicted_values, features)
        # The cost is recorded for the freshly updated theta.
        cost_history.append(compute_cost(features, values, theta))

    return theta, pandas.Series(cost_history)


def predictions(dataframe):
    """
    The NYC turnstile data is stored in a pandas dataframe called
    weather_turnstile. Using the information stored in the dataframe, let's
    predict the ridership of the NYC subway using linear regression with
    gradient descent.

    You can download the complete turnstile weather dataframe here:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv

    Your prediction should have a R^2 value of 0.40 or better.
    You need to experiment using various input features contained in the
    dataframe. We recommend that you don't use the EXITSn_hourly feature as an
    input to the linear model because we cannot use it as a predictor: we
    cannot use exits counts as a way to predict entry counts.

    Note: Due to the memory and CPU limitation of our Amazon EC2 instance, we
    will give you a random subset (~15%) of the data contained in
    turnstile_data_master_with_weather.csv. You are encouraged to experiment
    with this code on your own computer, locally.

    If you'd like to view a plot of your cost history, plot the
    ``cost_history`` series computed below. The slowdown from plotting is
    significant, so if you are timing out, the first thing to do is to remove
    the plotting again.

    If you receive a "server has encountered an error" message, that means you
    are hitting the 30-second limit that's placed on running your program. Try
    using a smaller number for num_iterations if that's the case.

    If you are using your own algorithm/models, see if you can optimize your
    code so that it runs faster.

    Args:
        dataframe: pandas DataFrame containing at least the columns 'rain',
            'precipi', 'Hour', 'meantempi', 'mintempi', 'meanwindspdi',
            'UNIT' and 'ENTRIESn_hourly'.

    Returns:
        A tuple ``(predictions, plot)``: a 1-D numpy array with one predicted
        'ENTRIESn_hourly' value per input row, and ``plot``, which is always
        ``None`` here.

    Raises:
        ValueError: propagated from ``normalize_features`` when any feature
            column (including a UNIT dummy column) is constant.
    """
    features = dataframe[['rain', 'precipi', 'Hour', 'meantempi',
                          'mintempi', 'meanwindspdi']]

    # Add UNIT to features using dummy variables. The dummies are cast to
    # float so the normalization arithmetic always runs on numeric columns,
    # whatever dtype get_dummies produces in the installed pandas.
    dummy_units = pandas.get_dummies(dataframe['UNIT'], prefix='unit').astype(float)
    features = features.join(dummy_units)

    # Values
    values = dataframe['ENTRIESn_hourly']
    m = len(values)

    features, mu, sigma = normalize_features(features)
    # Add a column of 1s (y intercept). It is added after normalization
    # because a constant column would be rejected by normalize_features.
    features['ones'] = np.ones(m)

    # Convert features and values to numpy arrays
    features_array = np.array(features)
    values_array = np.array(values)

    # Set values for alpha, number of iterations.
    alpha = 0.1  # please feel free to change this value
    num_iterations = 75  # please feel free to change this value

    # Initialize theta, perform gradient descent
    initial_theta = np.zeros(len(features.columns))
    theta_gradient_descent, cost_history = gradient_descent(
        features_array,
        values_array,
        initial_theta,
        alpha,
        num_iterations,
    )

    # No plot is produced; cost_history is available for plotting if wanted.
    plot = None

    # Named differently from the function itself to avoid shadowing it.
    predicted = np.dot(features_array, theta_gradient_descent)
    return predicted, plot
Run
Reset
Share
Import
Link
Embed
Language▼
English
中文
Python Fiddle
Python Cloud IDE
Follow @python_fiddle
Browser Version Not Supported
Due to Python Fiddle's reliance on advanced JavaScript techniques, older browsers might have problems running it correctly. Please download the latest version of your favourite browser.
Chrome 10+
Firefox 4+
Safari 5+
IE 10+
Let me try anyway!
url:
Go
Python Snippet
Stackoverflow Question