import React, { Component } from 'react';
import { Button, Icon } from "semantic-ui-react";
import { connect } from "react-redux";
import ReactGA4 from 'react-ga4';
import {
    changeCurrentSection,
} from "../../../actions/generalActions";
import arrowLeft from '../../images/arrow-left.svg';
import calendar from '../../images/calendar.svg';
import { Helmet } from 'react-helmet';
import ReactPlayer from 'react-player';

// Plain-text versions of the code snippets shown in the article, in page order.
// Copy button N copies texts[N - 1] (see onClickCopy), so the order here must
// match the numbers passed by the buttons in render().
//
// NOTE: these are JavaScript template literals, so a backslash that must reach
// the clipboard has to be escaped (`\\n`), otherwise it becomes a real line break.
const texts = [
    "dotnet --version",
    `dotnet new console -n OllamaScraperApp
cd OllamaScraperApp`,
    `class Program
{
    static async Task Main(string[] args)
    {
        string url = "https://ollama.com/library/llama3.2";
        using HttpClient client = new HttpClient();
        var response = await client.GetStringAsync(url);

        Console.WriteLine(response); // Display raw HTML for debugging
        // Here, you can use regex or an HTML parser to extract meaningful content
    }
}`,
    "dotnet add package HtmlAgilityPack",
    `using HtmlAgilityPack;

static string ExtractContent(string html)
{
    var doc = new HtmlDocument();
    doc.LoadHtml(html);

    // Extract content, e.g., all paragraphs
    var paragraphs = doc.DocumentNode.SelectNodes("//p");
    return string.Join("\\n", paragraphs.Select(p => p.InnerText.Trim()));
}`,
    `using System.Diagnostics;

static string AnalyzeContentWithOllama(string content)
{
    var process = new Process
    {
        StartInfo = new ProcessStartInfo
        {
            FileName = "ollama",
            Arguments = $"run llama3 'Summarize this: {content}'",
            RedirectStandardOutput = true,
            UseShellExecute = false,
            CreateNoWindow = true
        }
    };

    process.Start();
    string result = process.StandardOutput.ReadToEnd();
    process.WaitForExit();

    return result;
}`];

/**
 * Returns a promise that fulfils (with no value) once `ms` milliseconds have elapsed.
 *
 * @param {number} ms - Time to wait, in milliseconds.
 * @returns {Promise<void>}
 */
const delay = (ms) => new Promise((resolve) => {
    setTimeout(resolve, ms);
});

// Punctuation for the C# snippets rendered in JSX. Braces delimit JSX
// expressions and cannot be typed literally inside JSX text, so the snippets
// interpolate these constants instead (e.g. {llaveizquierda}).
const llaveizquierda = '{';       // left brace
const llavederecha = '}';         // right brace
const parentesisizquierdo = '(';  // left parenthesis
const parentesisderecho = ')';    // right parenthesis
const mayor = '>';                // greater-than sign

class OllamaPlusNETScraper extends Component {
    constructor(props) {
        super(props);

        this.state = {
            codeCopied: 0
        }

        this.changeSection = this.changeSection.bind(this);
        this.onClickCopy = this.onClickCopy.bind(this);
    }

    changeSection() {
        this.props.changeCurrentSection("Blog");
        if (this.props.cookieUp)
            ReactGA4.send({ hitType: "pageview", title: "Blog", page: '/Blog' });
    }

    async onClickCopy(codeCopied, isJson) {
        if (isJson)
            navigator.clipboard.writeText(JSON.stringify(texts[codeCopied - 1], null, 2));
        else
            navigator.clipboard.writeText(texts[codeCopied - 1]);

        this.setState({ codeCopied: codeCopied });

        await delay(2000);

        this.setState({ codeCopied: 0 });
    }

    render() {
        return (
            <>
                <Helmet>
                    <title>Building a Web Scraper with .NET and Analyzing Content Using Ollama</title>
                    <meta name="description" content="Learn how to build a .NET-based web scraper and integrate it with Ollama for powerful content analysis." />
                    <meta name="keywords" content="ollama, .NET, C#, AI, NLP, console application, language model, artificial intelligence" />
                    <meta property="og:title" content="Building a Web Scraper with .NET and Analyzing Content Using Ollama" />
                    <meta property="og:description" content="Learn how to build a .NET-based web scraper and integrate it with Ollama for powerful content analysis." />
                    <meta property="og:type" content="article" />
                    <meta property="article:published_time" content="2024-11-19" />
                </Helmet>
                <section class="article-section">
                    <div className='article-container'>
                        <div className='article-header'>
                            <a href='/blog'>
                                <div className='back-button-container'>
                                    <img src={arrowLeft} />
                                    <label>Back to Articles</label>
                                </div>
                            </a>
                        </div>
                        <div className='article'>
                            <h1>Building a Web Scraper with .NET and Analyzing Content Using Ollama</h1>
                            <div className='author-date-container'>
                                {/* <div className='general-container'>
                            <img src={user} />
                            <label>Jonathan Confenti</label>
                          </div> */}
                                <div className='general-container'>
                                    <img src={calendar} />
                                    <label>November 19, 2024</label>
                                </div>
                            </div>
                            <div className='article-div'>
                                <p>
                                    In this article, we’ll cover how to build a simple web scraper in .NET and integrate Ollama to process and analyze the extracted content.
                                    This combination of tools enables us to gather real-time data and use a language model to gain insights, making it ideal for applications
                                    like summarization, content analysis, or sentiment analysis.
                                </p>
                                <h2>Prerequisites</h2>
                                <div className='step'>
                                    <p className='mb-05'>
                                        Before you begin, make sure you have the following installed:
                                    </p>
                                    <ul class="custom-bullets">
                                        <li>.NET SDK (version 8.0)</li>
                                        <li>Visual Studio or any editor of your choice</li>
                                        <li>Ollama installed on your machine. For detailed instructions, refer to our previous article, <a href='/article#install-and-run-ollama-net'>Getting started with Ollama in .NET</a></li>
                                    </ul>
                                    <p>You can use the following command to verify the installed version of .NET.</p>
                                    <div className='bash'>
                                        <div className='bash-header'>
                                            <label>bash</label>
                                            <Button onClick={() => this.onClickCopy(1, false)}>
                                                <Icon className={this.state.codeCopied == 1 ? 'check' : ''} name={this.state.codeCopied == 1 ? 'check' : 'copy outline'} />
                                                <p>{this.state.codeCopied == 1 ? ' Copied!' : 'Copy'}</p>
                                            </Button>
                                        </div>
                                        <div className='bash-body'>
                                            <p>dotnet --version</p>
                                        </div>
                                    </div>
                                </div>
                                <h3>Step 1: Setting Up a Simple Web Scraper in .NET</h3>
                                <div className='step'>
                                    <p>
                                        We’ll start by building a basic web scraper using .NET’s <span>HttpClient</span>.
                                        This scraper will fetch content from a web page and extract relevant information.
                                    </p>
                                    <p>Create a new .NET console application:</p>
                                    <div className='bash mt-10'>
                                        <div className='bash-header'>
                                            <label>bash</label>
                                            <Button onClick={() => this.onClickCopy(2, false)}>
                                                <Icon className={this.state.codeCopied == 2 ? 'check' : ''} name={this.state.codeCopied == 2 ? 'check' : 'copy outline'} />
                                                <p>{this.state.codeCopied == 2 ? ' Copied!' : 'Copy'}</p>
                                            </Button>
                                        </div>
                                        <div className='bash-body'>
                                            <p>dotnet new console -n OllamaScraperApp</p>
                                            <p><span className='orange'>cd</span> OllamaScraperApp</p>
                                        </div>
                                    </div>
                                    <p>Inside the <span>Program.cs</span> file, add the following code to fetch content from a web page:</p>
                                    <div className='bash'>
                                        <div className='bash-header'>
                                            <label>bash</label>
                                            <Button onClick={() => this.onClickCopy(3, false)}>
                                                <Icon className={this.state.codeCopied == 3 ? 'check' : ''} name={this.state.codeCopied == 3 ? 'check' : 'copy outline'} />
                                                <p>{this.state.codeCopied == 3 ? ' Copied!' : 'Copy'}</p>
                                            </Button>
                                        </div>
                                        <div className='bash-body'>
                                            <p><span className='blue'>class </span> <span className='red'>Program</span></p>
                                            <p>{llaveizquierda}</p>
                                            <p className='one-space'><span className='blue'>static async</span> Task <span className='red'>Main</span>{parentesisizquierdo}<span className='orange'>string</span>[] args{llavederecha}</p>
                                            <p className='one-space'>{llaveizquierda}</p>
                                            <p className='two-spaces'><span className='orange'>string</span> url = <span className='green'>"https://ollama.com/library/llama3.2"</span>;</p>
                                            <p className='two-spaces'><span className='blue'>using</span> HttpClient client = <span className='blue'>new</span> HttpClient();</p>
                                            <p className='two-spaces'><span className='blue'>var</span> response = <span className='blue'>await</span> client.GetStringAsync(url);</p>
                                            <br></br>
                                            <p className='two-spaces'>Console.WriteLine(response); <span className='grey'>// Display raw HTML for debugging</span></p>
                                            <p className='two-spaces'><span className='grey'>// Here, you can use regex or an HTML parser to extract meaningful content</span></p>
                                            <p className='one-space'>{llavederecha}</p>
                                            <p>{llavederecha}</p>
                                        </div>
                                    </div>
                                </div>
                                <h3>Step 2: Extract Meaningful Content</h3>
                                <div className='step'>
                                    <p>
                                        Once you have the page’s HTML, parse it to extract relevant data.
                                        For a simple approach, you can use regex, but for more complex scenarios, consider using a library like HtmlAgilityPack.
                                    </p>
                                    <p>Install HtmlAgilityPack by running:</p>
                                    <div className='bash'>
                                        <div className='bash-header'>
                                            <label>bash</label>
                                            <Button onClick={() => this.onClickCopy(4, false)}>
                                                <Icon className={this.state.codeCopied == 4 ? 'check' : ''} name={this.state.codeCopied == 4 ? 'check' : 'copy outline'} />
                                                <p>{this.state.codeCopied == 4 ? ' Copied!' : 'Copy'}</p>
                                            </Button>
                                        </div>
                                        <div className='bash-body'>
                                            <p>dotnet add package HtmlAgilityPack</p>
                                        </div>
                                    </div>
                                    <p>Modify <span>Program.cs</span> to extract specific elements:</p>
                                    <div className='bash'>
                                        <div className='bash-header'>
                                            <label>bash</label>
                                            <Button onClick={() => this.onClickCopy(5, false)}>
                                                <Icon className={this.state.codeCopied == 5 ? 'check' : ''} name={this.state.codeCopied == 5 ? 'check' : 'copy outline'} />
                                                <p>{this.state.codeCopied == 5 ? ' Copied!' : 'Copy'}</p>
                                            </Button>
                                        </div>
                                        <div className='bash-body'>
                                            <p><span className='blue'>using</span> HtmlAgilityPack;</p>
                                            <br></br>
                                            <p><span className='blue'>static</span> <span className='orange'>string</span> <span className='red'>ExtractContent</span>{parentesisizquierdo}<span className='orange'>string</span> html{parentesisderecho}</p>
                                            <p>{llaveizquierda}</p>
                                            <p className='one-space'><span className='blue'>var</span> doc = <span className='blue'>new</span> HtmlDocument();</p>
                                            <p className='one-space'>doc.LoadHtml(html);</p>
                                            <br></br>
                                            <p className='one-space'><span className='grey'>// Extract content, e.g., all paragraphs</span></p>
                                            <p className='one-space'><span className='blue'>var</span> paragraphs = doc.DocumentNode.SelectNodes{parentesisizquierdo}<span className='green'>"//p"</span>{parentesisderecho};</p>
                                            <p className='one-space'><span className='blue'>return</span> <span className='orange'>string</span>.Join{parentesisizquierdo}<span className='green'>"\n"</span>, paragraphs.Select(p ={mayor} p.InnerText.Trim()){parentesisderecho};</p>
                                            <p>{llavederecha}</p>
                                        </div>
                                    </div>
                                </div>
                                <h3>Step 3: Analyzing Content with Ollama</h3>
                                <div className='step'>
                                    <p>
                                        Now that we have the content, let's pass it to Ollama for analysis.
                                        Use Ollama’s capabilities to summarize, analyze sentiment, or answer questions based on the content.
                                    </p>
                                    <p>Add code to run an Ollama model, as shown in previous articles:</p>
                                    <div className='bash'>
                                        <div className='bash-header'>
                                            <label>csharp</label>
                                            <Button onClick={() => this.onClickCopy(6, false)}>
                                                <Icon className={this.state.codeCopied == 6 ? 'check' : ''} name={this.state.codeCopied == 6 ? 'check' : 'copy outline'} />
                                                <p>{this.state.codeCopied == 6 ? ' Copied!' : 'Copy'}</p>
                                            </Button>
                                        </div>
                                        <div className='bash-body'>
                                            <p><span className='blue'>using</span> System.Diagnostics;</p>
                                            <br></br>
                                            <p><span className='blue'>static</span> <span className='orange'>string</span> <span className='red'>AnalyzeContentWithOllama</span>{parentesisizquierdo}<span className='orange'>string</span> content{parentesisderecho}</p>
                                            <p>{llaveizquierda}</p>
                                            <p className='one-space'><span className='blue'>var</span> process = <span className='blue'>new</span> Process</p>
                                            <p className='one-space'>{llaveizquierda}</p>
                                            <p className='two-spaces'>StartInfo = <span className='blue'>new</span> ProcessStartInfo</p>
                                            <p className='two-spaces'>{llaveizquierda}</p>
                                            <p className='three-spaces'>FileName = <span className='green'>"ollama"</span>,</p>
                                            <p className='three-spaces'>Arguments = <span className='green'>$"run llama3 'Summarize this: {llaveizquierda}content{llavederecha}'"</span>,</p>
                                            <p className='three-spaces'>RedirectStandardOutput = <span className='blue'>true</span>,</p>
                                            <p className='three-spaces'>UseShellExecute = <span className='blue'>false</span>,</p>
                                            <p className='three-spaces'>CreateNoWindow = <span className='blue'>true</span></p>
                                            <p className='two-spaces'>{llavederecha}</p>
                                            <p className='one-space'>{llavederecha};</p>
                                            <br></br>
                                            <p className='one-space'>process.Start();</p>
                                            <p className='one-space'><span className='orange'>string</span> result = process.StandardOutput.ReadToEnd();</p>
                                            <p className='one-space'>process.WaitForExit();</p>
                                            <br></br>
                                            <p className='one-space'><span className='blue'>return</span> result;</p>
                                            <p>{llavederecha}</p>
                                        </div>
                                    </div>
                                    <p>Call this method after extracting content to analyze it.</p>
                                </div>
                                <h3>Step 4: Testing and Enhancements</h3>
                                <div className='step'>
                                    <p>Run your application to see the output, then consider adding more functionality, such as:</p>
                                    <ul class="custom-bullets">
                                        <li>Setting up multiple endpoints to scrape and analyze different sources.</li>
                                        <li>Enhancing data processing for large or structured pages.</li>
                                        <li>Using Ollama to extract insights, compare data, or generate summaries across multiple pages.</li>
                                    </ul>
                                </div>
                                <h3>Demo Video</h3>
                                <div className='step'>
                                    <p>To make the implementation clearer, here is a demonstration video showcasing the execution and results of the application:</p>
                                    <ReactPlayer
                                        url='../../ollama-scraper.mp4'
                                        poster="../../portada3.png"
                                        controls={true}
                                        width='100%'
                                        height='auto'
                                        style={{
                                            borderRadius: '6px',
                                            background: 'linear-gradient(145deg, rgba(255,255,255,0.1) 0%, rgba(0,0,0,0.1) 100%)'
                                        }}
                                        config={{
                                            file: {
                                                attributes: {
                                                    poster: '../../portada3.png'
                                                }
                                            }
                                        }}
                                    />
                                </div>
                                <h3>Conclusion</h3>
                                <div className='step'>
                                    <p>
                                        In this article, we’ve built a basic scraper in .NET and integrated Ollama for content analysis.
                                        This setup provides a foundation for applications that need to gather and process information automatically,
                                        making it a powerful tool for research, market analysis, or content curation.
                                    </p>
                                </div>
                                <h3>Source Code</h3>
                                <div className='last-step'>
                                    <p>
                                        You can find the complete source code for this project on GitHub. Feel free to explore, clone, and modify it for your own use: <a href='https://github.com/coowise/OllamaScraperApp' target='_blank'>GitHub Repository: OllamaScraperApp</a>
                                    </p>
                                    <p>
                                        This repository includes all the files and configurations used in this tutorial, making it easy for you to replicate and extend the application.
                                    </p>
                                </div>
                            </div>
                            <div className='footer'>
                                <span>AI Development</span>
                            </div>
                        </div>
                    </div>
                </section>
            </>
        );
    }
}

/**
 * Selects the props this page reads from the Redux store.
 *
 * @param {object} value - Root Redux state; only its `general` slice is read.
 * @returns {{language: *, currentSection: *, cookieUp: *}}
 */
const mapStateToProps = (value) => {
    const { language, currentSection, cookieUp } = value.general;

    return { language, currentSection, cookieUp };
};

/**
 * Builds the action-dispatching props for this page.
 *
 * @param {Function} dispatch - Redux store dispatch function.
 * @returns {{changeCurrentSection: Function}} `changeCurrentSection(section)` creates the
 *     action with the imported action creator and returns whatever `dispatch` returns.
 */
const mapDispatchToProps = (dispatch) => {
    const dispatchSectionChange = (currentSection) => {
        const action = changeCurrentSection(currentSection);
        return dispatch(action);
    };

    return { changeCurrentSection: dispatchSectionChange };
};

// Bind the article page to the Redux store: state selectors first, action dispatchers second.
const connectToStore = connect(mapStateToProps, mapDispatchToProps);

export default connectToStore(OllamaPlusNETScraper);