Data Scientist - Environment Guide

A guide to setting up a Data Science environment with Claude Code.

Capability Overview

Capability        Tools
Notebooks         Jupyter, VS Code, Google Colab
Data Analysis     Pandas, Polars, NumPy
Visualization     Matplotlib, Seaborn, Plotly
Machine Learning  scikit-learn, XGBoost, LightGBM
Deep Learning     PyTorch, TensorFlow, JAX
MLOps             MLflow, Weights & Biases

Python Environment

# Install Python
brew install python@3.12  # macOS
winget install Python.Python.3.12  # Windows

# Conda (recommended for Data Science)
# Install Miniconda: https://docs.conda.io/en/latest/miniconda.html

# Create an environment
conda create -n ds python=3.12
conda activate ds

# Or with uv (faster)
pip install uv
uv venv
source .venv/bin/activate
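
With either environment active, the core stack installs in one step. A minimal sketch; the package list below is illustrative, not a required set:

# Conda (packages from the conda-forge channel)
conda install -c conda-forge pandas numpy scikit-learn jupyterlab

# uv (pip-compatible installer targeting .venv)
uv pip install pandas numpy scikit-learn jupyterlab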

Jupyter

JupyterLab

pip install jupyterlab

# Run
jupyter lab

# With extensions
pip install jupyterlab-git jupyterlab-lsp

VS Code Notebooks

# Extension
code --install-extension ms-toolsai.jupyter

# Kernel
pip install ipykernel
python -m ipykernel install --user --name=ds --display-name="Data Science"

Data Analysis Stack

Installation

pip install pandas numpy scipy polars pyarrow

Pandas

import pandas as pd
import numpy as np

# Read data
df = pd.read_csv("data.csv")
df = pd.read_parquet("data.parquet")
df = pd.read_sql("SELECT * FROM table", connection)

# Exploration
df.head()
df.info()
df.describe()
df.shape
df.dtypes

# Transformations
df['new_col'] = df['col1'] + df['col2']
df = df.dropna()
df = df.fillna(0)
df_grouped = df.groupby('category').agg({'value': ['mean', 'sum', 'count']})

# Merge/Join
merged = pd.merge(df1, df2, on='key', how='left')

Polars (Fast Alternative)

import polars as pl

# Read data (lazy evaluation)
df = pl.scan_parquet("data/*.parquet")

# Transformations
result = (
    df
    .filter(pl.col("status") == "active")
    .with_columns([
        pl.col("date").dt.year().alias("year"),
        (pl.col("value") * 1.1).alias("adjusted_value")
    ])
    .group_by("category")
    .agg([
        pl.len().alias("n"),  # pl.count() is deprecated in recent Polars
        pl.col("value").mean().alias("avg_value"),
        pl.col("value").std().alias("std_value")
    ])
    .sort("avg_value", descending=True)
    .collect()
)
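
.collect() materializes the lazy query into an eager pl.DataFrame. A hedged one-liner to persist the result; the file name is illustrative:

result.write_parquet("summary.parquet")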

Visualization

Installation

pip install matplotlib seaborn plotly altair

Matplotlib + Seaborn

import matplotlib.pyplot as plt
import seaborn as sns

# Configure style
sns.set_theme(style="whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)

# Histogram
fig, ax = plt.subplots()
sns.histplot(data=df, x="value", hue="category", ax=ax)
plt.title("Distribution by Category")
plt.savefig("histogram.png", dpi=300, bbox_inches='tight')

# Scatter with regression fit
sns.lmplot(data=df, x="feature1", y="target", hue="category")

# Correlation heatmap
correlation = df.select_dtypes(include=[np.number]).corr()
sns.heatmap(correlation, annot=True, cmap="coolwarm", center=0)

plt.show()

Plotly (Interactive)

import plotly.express as px
import plotly.graph_objects as go

# Interactive scatter
fig = px.scatter(
    df,
    x="feature1",
    y="target",
    color="category",
    size="value",
    hover_data=["name"],
    title="Feature Analysis"
)
fig.show()

# Dashboard with subplots
from plotly.subplots import make_subplots

fig = make_subplots(rows=2, cols=2)
fig.add_trace(go.Scatter(x=df['x'], y=df['y']), row=1, col=1)
fig.add_trace(go.Histogram(x=df['value']), row=1, col=2)
fig.update_layout(height=600)
fig.show()
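
Altair, installed above but not shown yet, covers similar ground with a declarative grammar. A minimal sketch, reusing the column names assumed in the Plotly example:

import altair as alt

# Declarative scatter; save() writes a self-contained HTML file
chart = alt.Chart(df).mark_point().encode(
    x="feature1",
    y="target",
    color="category"
)
chart.save("scatter.html")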

Machine Learning

scikit-learn

pip install scikit-learn

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Prepare data
X = df.drop('target', axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Train
pipeline.fit(X_train, y_train)

# Evaluate
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Cross-validation
scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
print(f"CV Accuracy: {scores.mean():.3f} (+/- {scores.std()*2:.3f})")

XGBoost

pip install xgboost

import xgboost as xgb
from sklearn.metrics import root_mean_squared_error  # replaces mean_squared_error(squared=False), removed in scikit-learn 1.6

# DMatrix (XGBoost's optimized data format)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Parameters (booster parameters only; boosting rounds and early
# stopping are passed directly to xgb.train below, not in this dict)
params = {
    'objective': 'reg:squarederror',
    'max_depth': 6,
    'learning_rate': 0.1
}

# Train with early stopping
model = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=[(dtest, 'test')],
    early_stopping_rounds=10
)

# Predict
y_pred = model.predict(dtest)
rmse = root_mean_squared_error(y_test, y_pred)
print(f"RMSE: {rmse:.4f}")

# Feature importance
xgb.plot_importance(model)
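
LightGBM, listed in the capability table but not shown above, exposes the same sklearn-style API. A minimal sketch, assuming the same regression split used in the XGBoost example:

pip install lightgbm

import lightgbm as lgb

# sklearn-compatible estimator; score() returns R^2 for regressors
model = lgb.LGBMRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)
print(f"R^2: {model.score(X_test, y_test):.4f}")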

Deep Learning

PyTorch

pip install torch torchvision torchaudio

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Model
class NeuralNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_size, output_size)
        )

    def forward(self, x):
        return self.layers(x)

# Data
X_tensor = torch.FloatTensor(X_train.values)
y_tensor = torch.FloatTensor(y_train.values)
dataset = TensorDataset(X_tensor, y_tensor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = NeuralNetwork(X_train.shape[1], 64, 1).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(100):
    for batch_X, batch_y in dataloader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs.squeeze(), batch_y)
        loss.backward()
        optimizer.step()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

MLOps

MLflow

pip install mlflow

import mlflow
import mlflow.sklearn

# Configure tracking
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("my_experiment")

# Log an experiment run
with mlflow.start_run():
    # Parameters
    mlflow.log_params({
        "n_estimators": 100,
        "max_depth": 6,
        "learning_rate": 0.1
    })

    # Train model
    model = RandomForestClassifier(n_estimators=100)
    model.fit(X_train, y_train)

    # Metrics
    accuracy = model.score(X_test, y_test)
    mlflow.log_metric("accuracy", accuracy)

    # Artifacts
    mlflow.sklearn.log_model(model, "model")
    mlflow.log_artifact("feature_importance.png")

# Load model
model_uri = "runs:/<run_id>/model"
loaded_model = mlflow.sklearn.load_model(model_uri)
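
The loaded model exposes the same predict API as the original estimator. A hedged usage line, reusing X_test from the earlier split:

preds = loaded_model.predict(X_test)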

# MLflow server
mlflow server --host 0.0.0.0 --port 5000

# UI
mlflow ui

Weights & Biases

pip install wandb

import wandb

# Initialize a run
wandb.init(project="my_project", config={
    "learning_rate": 0.01,
    "epochs": 100,
    "batch_size": 32
})

# Log during training
for epoch in range(100):
    loss = train_epoch()  # train_epoch(): your own per-epoch training function returning the loss
    wandb.log({"epoch": epoch, "loss": loss})

# Log final metrics
wandb.log({
    "accuracy": accuracy,
    "f1_score": f1,
    "confusion_matrix": wandb.plot.confusion_matrix(
        y_true=y_test, preds=y_pred, class_names=class_names
    )
})

wandb.finish()

Hyperparameter Tuning

Optuna

pip install optuna

import optuna

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
    }

    model = xgb.XGBClassifier(**params, random_state=42)
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    return score.mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print(f"Best params: {study.best_params}")
print(f"Best accuracy: {study.best_value:.4f}")

Commands Claude Code Will Run

# Jupyter
jupyter lab
jupyter notebook

# MLflow
mlflow server
mlflow ui
mlflow models serve -m runs:/<run_id>/model

# Weights & Biases
wandb login
wandb sync

# Analysis
python -c "import pandas; print(pandas.read_csv('data.csv').describe())"

VS Code Extensions

code --install-extension ms-toolsai.jupyter
code --install-extension ms-toolsai.vscode-ai
code --install-extension ms-python.python
code --install-extension ms-python.vscode-pylance

Environment Verification

#!/bin/bash
echo "=== Verificación Entorno Data Science ==="

echo -e "\n--- Python ---"
python --version
conda --version 2>/dev/null || echo "Conda no instalado"

echo -e "\n--- Paquetes Core ---"
python -c "import pandas; print(f'Pandas: {pandas.__version__}')" 2>/dev/null || echo "Pandas no instalado"
python -c "import numpy; print(f'NumPy: {numpy.__version__}')" 2>/dev/null || echo "NumPy no instalado"
python -c "import sklearn; print(f'scikit-learn: {sklearn.__version__}')" 2>/dev/null || echo "scikit-learn no instalado"

echo -e "\n--- Deep Learning ---"
python -c "import torch; print(f'PyTorch: {torch.__version__}')" 2>/dev/null || echo "PyTorch no instalado"
python -c "import tensorflow; print(f'TensorFlow: {tensorflow.__version__}')" 2>/dev/null || echo "TensorFlow no instalado"

echo -e "\n--- MLOps ---"
mlflow --version 2>/dev/null || echo "MLflow no instalado"

echo -e "\n=== Verificación Completa ==="

Resources