Data Scientist - Environment Guide¶
Guide to setting up a Data Science environment with Claude Code.
Capability Overview¶
| Capability | Tools |
|---|---|
| Notebooks | Jupyter, VS Code, Google Colab |
| Data Analysis | Pandas, Polars, NumPy |
| Visualization | Matplotlib, Seaborn, Plotly |
| Machine Learning | scikit-learn, XGBoost, LightGBM |
| Deep Learning | PyTorch, TensorFlow, JAX |
| MLOps | MLflow, Weights & Biases |
Python Environment¶
# Install Python
brew install python@3.12 # macOS
winget install Python.Python.3.12 # Windows

# Conda (recommended for Data Science)
# Install Miniconda: https://docs.conda.io/en/latest/miniconda.html

# Create an environment
conda create -n ds python=3.12
conda activate ds

# Or with uv (faster)
pip install uv
uv venv
source .venv/bin/activate
Jupyter¶
JupyterLab¶
pip install jupyterlab
# Run
jupyter lab
# With extensions
pip install jupyterlab-git jupyterlab-lsp
VS Code Notebooks¶
# Extension
code --install-extension ms-toolsai.jupyter
# Kernel
pip install ipykernel
python -m ipykernel install --user --name=ds --display-name="Data Science"
Data Analysis Stack¶
Installation¶
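One way to install the stack (pip shown; conda install works just as well):

pip install pandas polars numpy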
Pandas¶
import pandas as pd
import numpy as np
# Read data
df = pd.read_csv("data.csv")
df = pd.read_parquet("data.parquet")
df = pd.read_sql("SELECT * FROM my_table", connection)  # connection: any DB-API/SQLAlchemy connection
# Exploration
df.head()
df.info()
df.describe()
df.shape
df.dtypes
# Transformations
df['new_col'] = df['col1'] + df['col2']
df = df.dropna()
df = df.fillna(0)
df_grouped = df.groupby('category').agg({'value': ['mean', 'sum', 'count']})
# Merge/Join
merged = pd.merge(df1, df2, on='key', how='left')
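Writing results back out mirrors the read API; a quick sketch (file names illustrative; Parquet output needs pyarrow or fastparquet installed):

# Persist the cleaned frame
df.to_csv("clean.csv", index=False)
df.to_parquet("clean.parquet")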
Polars (Fast Alternative)¶
import polars as pl
# Read data (lazy evaluation)
df = pl.scan_parquet("data/*.parquet")
# Transformations
result = (
    df
    .filter(pl.col("status") == "active")
    .with_columns([
        pl.col("date").dt.year().alias("year"),
        (pl.col("value") * 1.1).alias("adjusted_value")
    ])
    .group_by("category")
    .agg([
        pl.len().alias("n"),  # pl.count() is deprecated in recent Polars
        pl.col("value").mean().alias("avg_value"),
        pl.col("value").std().alias("std_value")
    ])
    .sort("avg_value", descending=True)
    .collect()
)
Visualization¶
Installation¶
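One way to install these libraries (pip shown):

pip install matplotlib seaborn plotly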
Matplotlib + Seaborn¶
import matplotlib.pyplot as plt
import seaborn as sns
# Configure the style
sns.set_theme(style="whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)
# Histogram
fig, ax = plt.subplots()
sns.histplot(data=df, x="value", hue="category", ax=ax)
plt.title("Distribution by Category")
plt.savefig("histogram.png", dpi=300, bbox_inches='tight')
# Scatter with regression fit
sns.lmplot(data=df, x="feature1", y="target", hue="category")
# Correlation heatmap
correlation = df.select_dtypes(include=[np.number]).corr()
sns.heatmap(correlation, annot=True, cmap="coolwarm", center=0)
plt.show()
Plotly (Interactive)¶
import plotly.express as px
import plotly.graph_objects as go
# Interactive scatter
fig = px.scatter(
    df,
    x="feature1",
    y="target",
    color="category",
    size="value",
    hover_data=["name"],
    title="Feature Analysis"
)
fig.show()
# Dashboard with subplots
from plotly.subplots import make_subplots
fig = make_subplots(rows=2, cols=2)
fig.add_trace(go.Scatter(x=df['x'], y=df['y']), row=1, col=1)
fig.add_trace(go.Histogram(x=df['value']), row=1, col=2)
fig.update_layout(height=600)
fig.show()
Machine Learning¶
scikit-learn¶
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
# Prepare data
X = df.drop('target', axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])
# Train
pipeline.fit(X_train, y_train)
# Evaluate
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
# Cross-validation
scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
print(f"CV Accuracy: {scores.mean():.3f} (+/- {scores.std()*2:.3f})")
XGBoost¶
import xgboost as xgb
from sklearn.metrics import root_mean_squared_error  # replaces mean_squared_error(..., squared=False) in recent scikit-learn
# DMatrix (optimized format)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
# Parameters (n_estimators and early_stopping_rounds are not params-dict keys
# for xgb.train; boosting rounds and early stopping are passed below)
params = {
    'objective': 'reg:squarederror',
    'max_depth': 6,
    'learning_rate': 0.1
}
# Train with early stopping
model = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=[(dtest, 'test')],
    early_stopping_rounds=10
)
# Prediction
y_pred = model.predict(dtest)
rmse = root_mean_squared_error(y_test, y_pred)
print(f"RMSE: {rmse:.4f}")
# Feature importance
xgb.plot_importance(model)
Deep Learning¶
PyTorch¶
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
# Model
class NeuralNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_size, output_size)
        )

    def forward(self, x):
        return self.layers(x)
# Data
X_tensor = torch.FloatTensor(X_train.values)
y_tensor = torch.FloatTensor(y_train.values)
dataset = TensorDataset(X_tensor, y_tensor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
# Training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = NeuralNetwork(X_train.shape[1], 64, 1).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
for epoch in range(100):
    for batch_X, batch_y in dataloader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs.squeeze(), batch_y)
        loss.backward()
        optimizer.step()
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")
MLOps¶
MLflow¶
import mlflow
import mlflow.sklearn
# Configure tracking
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("my_experiment")
# Log an experiment
with mlflow.start_run():
    # Parameters
    mlflow.log_params({
        "n_estimators": 100,
        "max_depth": 6,
        "learning_rate": 0.1
    })

    # Train the model
    model = RandomForestClassifier(n_estimators=100)
    model.fit(X_train, y_train)

    # Metrics
    accuracy = model.score(X_test, y_test)
    mlflow.log_metric("accuracy", accuracy)

    # Artifacts
    mlflow.sklearn.log_model(model, "model")
    mlflow.log_artifact("feature_importance.png")
# Load a model
model_uri = "runs:/<run_id>/model"
loaded_model = mlflow.sklearn.load_model(model_uri)
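For scikit-learn models, autologging can replace most of the manual calls above; a sketch:

# Autolog captures params, metrics, and the model automatically
mlflow.sklearn.autolog()
with mlflow.start_run():
    model = RandomForestClassifier(n_estimators=100)
    model.fit(X_train, y_train)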
Weights & Biases¶
import wandb
# Initialize
wandb.init(project="my_project", config={
    "learning_rate": 0.01,
    "epochs": 100,
    "batch_size": 32
})
# Log during training
for epoch in range(100):
    loss = train_epoch()  # train_epoch() stands in for your own training step
    wandb.log({"epoch": epoch, "loss": loss})
# Log final metrics
wandb.log({
    "accuracy": accuracy,
    "f1_score": f1,
    "confusion_matrix": wandb.plot.confusion_matrix(
        y_true=y_test, preds=y_pred, class_names=class_names
    )
})
wandb.finish()
Hyperparameter Tuning¶
Optuna¶
import optuna
import xgboost as xgb
from sklearn.model_selection import cross_val_score

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
    }
    model = xgb.XGBClassifier(**params, random_state=42)
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    return score.mean()
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
print(f"Best params: {study.best_params}")
print(f"Best accuracy: {study.best_value:.4f}")
Commands Claude Code Will Run¶
# Jupyter
jupyter lab
jupyter notebook
# MLflow
mlflow server
mlflow ui
mlflow models serve -m runs:/<run_id>/model
# Weights & Biases
wandb login
wandb sync
# Analysis
python -c "import pandas; print(pandas.read_csv('data.csv').describe())"
VS Code Extensions¶
code --install-extension ms-toolsai.jupyter
code --install-extension ms-toolsai.vscode-ai
code --install-extension ms-python.python
code --install-extension ms-python.vscode-pylance
Environment Verification¶
#!/bin/bash
echo "=== Data Science Environment Check ==="

echo -e "\n--- Python ---"
python --version
conda --version 2>/dev/null || echo "Conda not installed"

echo -e "\n--- Core Packages ---"
python -c "import pandas; print(f'Pandas: {pandas.__version__}')" 2>/dev/null || echo "Pandas not installed"
python -c "import numpy; print(f'NumPy: {numpy.__version__}')" 2>/dev/null || echo "NumPy not installed"
python -c "import sklearn; print(f'scikit-learn: {sklearn.__version__}')" 2>/dev/null || echo "scikit-learn not installed"

echo -e "\n--- Deep Learning ---"
python -c "import torch; print(f'PyTorch: {torch.__version__}')" 2>/dev/null || echo "PyTorch not installed"
python -c "import tensorflow; print(f'TensorFlow: {tensorflow.__version__}')" 2>/dev/null || echo "TensorFlow not installed"

echo -e "\n--- MLOps ---"
mlflow --version 2>/dev/null || echo "MLflow not installed"

echo -e "\n=== Check Complete ==="